Compare commits


1 Commit

Author: I-Al-Istannen
SHA1: 2d145e7c94
Message: Check for new versions at startup
Date: 2022-10-24 17:31:34 +02:00

20 changed files with 406 additions and 220 deletions


@@ -17,9 +17,9 @@ jobs:
python: ["3.9"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v2
- uses: actions/setup-python@v4
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python }}
@@ -45,7 +45,7 @@ jobs:
run: mv dist/pferd* dist/pferd-${{ matrix.os }}
- name: Upload binary
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v2
with:
name: Binaries
path: dist/pferd-${{ matrix.os }}
@@ -57,7 +57,7 @@ jobs:
steps:
- name: Download binaries
uses: actions/download-artifact@v3
uses: actions/download-artifact@v2
with:
name: Binaries

.gitignore (1 line changed)

@@ -2,6 +2,7 @@
/.venv/
/PFERD.egg-info/
__pycache__/
/.vscode/
# pyinstaller
/pferd.spec


@@ -1,8 +0,0 @@
{
"files.insertFinalNewline": true,
"files.trimFinalNewlines": true,
"python.formatting.provider": "autopep8",
"python.linting.enabled": true,
"python.linting.flake8Enabled": true,
"python.linting.mypyEnabled": true,
}


@@ -23,34 +23,8 @@ ambiguous situations.
## Unreleased
### Fixed
- Crawling of courses with the timeline view as the default tab
- Crawling of file and custom opencast cards
- Crawling of button cards without descriptions
## 3.4.3 - 2022-11-29
### Added
- Missing documentation for `forums` option
### Changed
- Clear up error message shown when multiple paths are found to an element
### Fixed
- IPD crawler unnecessarily appending trailing slashes
- Crawling opencast when ILIAS is set to English
## 3.4.2 - 2022-10-26
### Added
- Recognize and crawl content pages in cards
- Recognize and ignore surveys
### Fixed
- Forum crawling crashing when a thread has no messages at all
- Forum crawling crashing when parsing empty (= 0 messages) threads
- Forum crawling crashing when a forum has no threads at all
- Ilias login failing in some cases
- Crawling of paginated future meetings
- IPD crawler handling of URLs without trailing slash
## 3.4.1 - 2022-08-17


@@ -181,7 +181,6 @@ script once per day should be fine.
redirect to the actual URL. Set to a negative value to disable the automatic
redirect. (Default: `-1`)
- `videos`: Whether to download videos. (Default: `no`)
- `forums`: Whether to download forum threads. (Default: `no`)
- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
`20.0`)
@@ -290,7 +289,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
Example: `foo/bar --> baz`
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`
- Converts `foo/bar` into `baz`
- Converts `foo/bar/wargl` into `baz/wargl`
- Converts `foo/bar/wargl` into `bar/wargl`
Example: `foo/bar --> !`
- Doesn't match `foo`, `a/foo/bar` or `foo/baz`


@@ -5,6 +5,8 @@ import os
import sys
from pathlib import Path
from PFERD.update import check_for_updates
from .auth import AuthLoadError
from .cli import PARSER, ParserLoadError, load_default_section
from .config import Config, ConfigDumpError, ConfigLoadError, ConfigOptionError
@@ -134,6 +136,11 @@ def main() -> None:
loop.run_until_complete(asyncio.sleep(1))
loop.close()
else:
log.explain_topic("Checking for updates")
if not args.skip_update_check:
asyncio.run(check_for_updates())
else:
log.explain("Update check skipped due to configuration option")
asyncio.run(pferd.run(args.debug_transforms))
except (ConfigOptionError, AuthLoadError) as e:
log.unlock()


@@ -151,6 +151,11 @@ PARSER.add_argument(
action="version",
version=f"{NAME} {VERSION} (https://github.com/Garmelon/PFERD)",
)
PARSER.add_argument(
"--skip-update-check",
action="store_true",
help="disable automatic update checks at startup"
)
PARSER.add_argument(
"--config", "-c",
type=Path,


@@ -9,6 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Ty
from ..auth import Authenticator
from ..config import Config, Section
from ..deduplicator import Deduplicator
from ..limiter import Limiter
from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from ..report import MarkConflictError, MarkDuplicateError, Report
@@ -97,9 +98,10 @@ def anoncritical(f: AWrapped) -> AWrapped:
class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
def __init__(self, path: PurePath):
def __init__(self, limiter: Limiter, path: PurePath):
super().__init__()
self._limiter = limiter
self._path = path
@property
@@ -108,15 +110,17 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
async def _on_aenter(self) -> ProgressBar:
self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path)))
await self._stack.enter_async_context(self._limiter.limit_crawl())
bar = self._stack.enter_context(log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path)))
return bar
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
def __init__(self, fs_token: FileSinkToken, path: PurePath):
def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath):
super().__init__()
self._limiter = limiter
self._fs_token = fs_token
self._path = path
@@ -125,6 +129,7 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
return self._path
async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
await self._stack.enter_async_context(self._limiter.limit_download())
sink = await self._stack.enter_async_context(self._fs_token)
# The "Downloaded ..." message is printed in the output dir, not here
bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading",
@@ -230,6 +235,12 @@ class Crawler(ABC):
self.name = name
self.error_free = True
self._limiter = Limiter(
task_limit=section.tasks(),
download_limit=section.downloads(),
task_delay=section.task_delay(),
)
self._deduplicator = Deduplicator(section.windows_paths())
self._transformer = Transformer(section.transform())
@@ -277,7 +288,7 @@ class Crawler(ABC):
return None
log.explain("Answer: Yes")
return CrawlToken(path)
return CrawlToken(self._limiter, path)
async def download(
self,
@@ -302,7 +313,7 @@ class Crawler(ABC):
return None
log.explain("Answer: Yes")
return DownloadToken(fs_token, path)
return DownloadToken(self._limiter, fs_token, path)
async def _cleanup(self) -> None:
log.explain_topic("Decision: Clean up files")


@@ -1,9 +1,12 @@
import asyncio
from http.cookiejar import LWPCookieJar
import http.cookies
import ssl
from pathlib import Path, PurePath
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional
import requests
import aiohttp
import certifi
from aiohttp.client import ClientTimeout
from ..auth import Authenticator
from ..config import Config
@@ -32,9 +35,9 @@ class HttpCrawler(Crawler):
self._authentication_id = 0
self._authentication_lock = asyncio.Lock()
self._http_timeout = section.http_timeout() # TODO Use or remove
self._request_count = 0
self._http_timeout = section.http_timeout()
self._cookie_jar = LWPCookieJar()
self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
self._shared_cookie_jar_paths: Optional[List[Path]] = None
self._shared_auth = shared_auth
@@ -54,6 +57,7 @@ class HttpCrawler(Crawler):
# This should reduce the amount of requests we make: If an authentication is in progress
# all future requests wait for authentication to complete.
async with self._authentication_lock:
self._request_count += 1
return self._authentication_id
async def authenticate(self, caller_auth_id: int) -> None:
@@ -102,13 +106,32 @@ class HttpCrawler(Crawler):
self._shared_cookie_jar_paths.append(self._cookie_jar_path)
def _load_cookies_from_file(self, path: Path) -> None:
jar: Any = http.cookies.SimpleCookie()
with open(path, encoding="utf-8") as f:
for i, line in enumerate(f):
# Names of headers are case insensitive
if line[:11].lower() == "set-cookie:":
jar.load(line[11:])
else:
log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it")
self._cookie_jar.update_cookies(jar)
def _save_cookies_to_file(self, path: Path) -> None:
jar: Any = http.cookies.SimpleCookie()
for morsel in self._cookie_jar:
jar[morsel.key] = morsel
with open(path, "w", encoding="utf-8") as f:
f.write(jar.output(sep="\n"))
f.write("\n") # A trailing newline is just common courtesy
def _load_cookies(self) -> None:
log.explain_topic("Loading cookies")
cookie_jar_path: Optional[Path] = None
if self._shared_cookie_jar_paths is None:
log.explain("Not sharing cookies")
log.explain("Not sharing any cookies")
cookie_jar_path = self._cookie_jar_path
else:
log.explain("Sharing cookies")
@@ -131,38 +154,46 @@ class HttpCrawler(Crawler):
log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
try:
self._cookie_jar.load(filename=str(cookie_jar_path))
self._load_cookies_from_file(cookie_jar_path)
except Exception as e:
log.explain(f"Failed to load cookies: {e}")
log.explain("Proceeding without cookies")
log.explain("Failed to load cookies")
log.explain(str(e))
def _save_cookies(self) -> None:
log.explain_topic("Saving cookies")
try:
log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
self._cookie_jar.save(filename=str(self._cookie_jar_path))
self._save_cookies_to_file(self._cookie_jar_path)
except Exception as e:
log.warn(f"Failed to save cookies: {e}")
log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
log.warn(str(e))
async def run(self) -> None:
self._request_count = 0
self._cookie_jar = aiohttp.CookieJar()
self._load_cookies()
self.session = requests.Session()
self.session.headers["User-Agent"] = f"{NAME}/{VERSION}"
# From the requests docs: "All requests code should work out of the box
# with externally provided instances of CookieJar, e.g. LWPCookieJar and
# FileCookieJar."
# https://requests.readthedocs.io/en/latest/api/#requests.cookies.RequestsCookieJar
self.session.cookies = self._cookie_jar # type: ignore
with self.session:
async with aiohttp.ClientSession(
headers={"User-Agent": f"{NAME}/{VERSION}"},
cookie_jar=self._cookie_jar,
connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
timeout=ClientTimeout(
# 30 minutes. No download in the history of downloads was longer than 30 minutes.
# This is enough to transfer a 600 MB file over a 3 Mib/s connection.
# Allowing an arbitrary value could be annoying for overnight batch jobs
total=15 * 60,
connect=self._http_timeout,
sock_connect=self._http_timeout,
sock_read=self._http_timeout,
)
) as session:
self.session = session
try:
await super().run()
finally:
del self.session
log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
# They are saved in authenticate, but a final save won't hurt
self._save_cookies()
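
For context, a standalone sketch (not part of the diff) of the Set-Cookie round-trip that the new _load_cookies_from_file and _save_cookies_to_file helpers perform with http.cookies.SimpleCookie; the file name and cookie value below are made up:

import http.cookies

# Saving: one "Set-Cookie: key=value" line per cookie, plus a trailing newline.
jar = http.cookies.SimpleCookie()
jar["PHPSESSID"] = "made-up-session-id"  # hypothetical cookie
with open("cookies.txt", "w", encoding="utf-8") as f:
    f.write(jar.output(sep="\n"))
    f.write("\n")

# Loading: only lines starting with "Set-Cookie:" (case-insensitive) are parsed.
loaded = http.cookies.SimpleCookie()
with open("cookies.txt", encoding="utf-8") as f:
    for line in f:
        if line[:11].lower() == "set-cookie:":
            loaded.load(line[11:])

print(loaded["PHPSESSID"].value)  # made-up-session-id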


@@ -24,7 +24,6 @@ class IliasElementType(Enum):
LINK = "link"
BOOKING = "booking"
MEETING = "meeting"
SURVEY = "survey"
VIDEO = "video"
VIDEO_PLAYER = "video_player"
VIDEO_FOLDER = "video_folder"
@@ -134,7 +133,7 @@ class IliasPage:
thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
form_data: Dict[str, Union[str, List[str]]] = {
form_data: Dict[str, Union[str, List[str]]] = {
"thread_ids[]": thread_ids,
"selected_cmd2": "html",
"select_cmd2": "Ausführen",
@@ -158,8 +157,6 @@ class IliasPage:
if self._contains_collapsed_future_meetings():
log.explain("Requesting *all* future meetings")
return self._uncollapse_future_meetings_url()
if not self._is_content_tab_selected():
return self._select_content_page_url()
return None
def _is_forum_page(self) -> bool:
@@ -222,27 +219,6 @@ class IliasPage:
link = self._abs_url_from_link(element)
return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")
def _is_content_tab_selected(self) -> bool:
return self._select_content_page_url() is None
def _select_content_page_url(self) -> Optional[IliasPageElement]:
tab = self._soup.find(
id="tab_view_content",
attrs={"class": lambda x: x is not None and "active" not in x}
)
# Already selected (or not found)
if not tab:
return None
link = tab.find("a")
if link:
link = self._abs_url_from_link(link)
return IliasPageElement(IliasElementType.FOLDER, link, "select content page")
_unexpected_html_warning()
log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.")
log.warn_contd("PFERD might not find content on the course's main page.")
return None
def _player_to_video(self) -> List[IliasPageElement]:
# Fetch the actual video page. This is a small wrapper page initializing a javascript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere
@@ -389,7 +365,7 @@ class IliasPage:
"""
# Video start links are marked with an "Abspielen" link
video_links: List[Tag] = self._soup.findAll(
name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
name="a", text=re.compile(r"\s*Abspielen\s*")
)
results: List[IliasPageElement] = []
@@ -708,11 +684,7 @@ class IliasPage:
"div",
attrs={"class": lambda x: x and "caption" in x},
)
caption_container = caption_parent.find_next_sibling("div")
if caption_container:
description = caption_container.getText().strip()
else:
description = None
description = caption_parent.find_next_sibling("div").getText().strip()
if not type:
_unexpected_html_warning()
@@ -742,7 +714,7 @@ class IliasPage:
icon: Tag = card_root.select_one(".il-card-repository-head .icon")
if "opencast" in icon["class"] or "xoct" in icon["class"]:
if "opencast" in icon["class"]:
return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED
if "exc" in icon["class"]:
return IliasElementType.EXERCISE
@@ -758,12 +730,6 @@ class IliasPage:
return IliasElementType.TEST
if "fold" in icon["class"]:
return IliasElementType.FOLDER
if "copa" in icon["class"]:
return IliasElementType.FOLDER
if "svy" in icon["class"]:
return IliasElementType.SURVEY
if "file" in icon["class"]:
return IliasElementType.FILE
_unexpected_html_warning()
log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")


@@ -126,6 +126,13 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
return decorator
def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]:
"""
Wraps any I/O exception in a CrawlWarning.
"""
return _iorepeat(1, name)
# Crawler control flow:
#
# crawl_desktop -+
@@ -187,7 +194,7 @@ instance's greatest bottleneck.
self._links = section.links()
self._videos = section.videos()
self._forums = section.forums()
self._visited_urls: Dict[str, PurePath] = dict()
self._visited_urls: Set[str] = set()
async def _run(self) -> None:
if isinstance(self._target, int):
@@ -219,45 +226,114 @@ instance's greatest bottleneck.
return
cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
def ensure_is_valid_course_id(parent: Optional[IliasPageElement], soup: BeautifulSoup) -> None:
if parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
await self._crawl_ilias_page(url, None, cl, ensure_is_valid_course_id)
@_iorepeat(3, "crawling url")
async def gather_elements() -> None:
elements.clear()
async with cl:
next_stage_url: Optional[str] = url
current_parent = None
# Duplicated code, but the root page is special - we want to avoid fetching it twice!
while next_stage_url:
soup = await self._get_page(next_stage_url)
if current_parent is None and expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
page = IliasPage(soup, next_stage_url, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)
# Fill up our task list with the found elements
await gather_elements()
if description:
await self._download_description(PurePath("."), description[0])
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(PurePath("."), element):
tasks.append(asyncio.create_task(handle))
# And execute them
await self.gather(tasks)
async def _handle_ilias_page(
self,
url: str,
parent: IliasPageElement,
path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(path)
if not maybe_cl:
return None
return self._crawl_ilias_page(url, parent, maybe_cl)
@anoncritical
async def _crawl_ilias_page(
self,
url: str,
parent: Optional[IliasPageElement],
parent: IliasPageElement,
cl: CrawlToken,
next_stage_hook: Callable[[Optional[IliasPageElement], BeautifulSoup], None] = lambda a, b: None
) -> None:
async with cl:
next_stage_url: Optional[str] = url
current_parent = parent
elements: List[IliasPageElement] = []
# A list as variable redefinitions are not propagated to outer scopes
description: List[BeautifulSoup] = []
while next_stage_url:
soup = await self._get_page(next_stage_url)
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
@_iorepeat(3, "crawling folder")
async def gather_elements() -> None:
elements.clear()
async with cl:
next_stage_url: Optional[str] = url
current_parent = parent
next_stage_hook(current_parent, soup)
while next_stage_url:
soup = await self._get_page(next_stage_url)
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
page = IliasPage(soup, next_stage_url, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
page = IliasPage(soup, next_stage_url, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)
for element in sorted(page.get_child_elements(), key=lambda e: e.id()):
await self._handle_ilias_element(cl.path, element)
# Fill up our task list with the found elements
await gather_elements()
if description_string := page.get_description():
await self._download_description(cl.path, description_string)
if description:
await self._download_description(cl.path, description[0])
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(cl.path, element):
tasks.append(asyncio.create_task(handle))
# And execute them
await self.gather(tasks)
# These decorators only apply *to this method* and *NOT* to the returned
# awaitables!
@@ -269,14 +345,12 @@ instance's greatest bottleneck.
self,
parent_path: PurePath,
element: IliasPageElement,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
if element.url in self._visited_urls:
raise CrawlWarning(
f"Found second path to element {element.name!r} at {element.url!r}. "
+ f"First path: {fmt_path(self._visited_urls[element.url])}. "
+ f"Second path: {fmt_path(parent_path)}."
f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath"
)
self._visited_urls[element.url] = parent_path
self._visited_urls.add(element.url)
element_path = PurePath(parent_path, element.name)
@@ -291,7 +365,7 @@ instance's greatest bottleneck.
return None
if element.type == IliasElementType.FILE:
await self._handle_file(element, element_path)
return await self._handle_file(element, element_path)
elif element.type == IliasElementType.FORUM:
if not self._forums:
log.status(
@@ -301,36 +375,22 @@ instance's greatest bottleneck.
"[bright_black](enable with option 'forums')"
)
return None
await self._handle_forum(element, element_path)
return await self._handle_forum(element, element_path)
elif element.type == IliasElementType.TEST:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](tests contain no relevant data)"
)
return None
elif element.type == IliasElementType.SURVEY:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](surveys contain no relevant data)"
)
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}")
log.explain("Tests contain no relevant files")
log.explain("Answer: No")
return None
elif element.type == IliasElementType.LINK:
await self._handle_link(element, element_path)
return await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING:
await self._handle_booking(element, element_path)
return await self._handle_booking(element, element_path)
elif element.type == IliasElementType.VIDEO:
await self._handle_file(element, element_path)
return await self._handle_file(element, element_path)
elif element.type == IliasElementType.VIDEO_PLAYER:
await self._handle_video(element, element_path)
return await self._handle_video(element, element_path)
elif element.type in _DIRECTORY_PAGES:
maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None
await self._crawl_ilias_page(element.url, element, maybe_cl)
return await self._handle_ilias_page(element.url, element, element_path)
else:
# This will retry it a few times, failing every time. It doesn't make any network
# requests, so that's fine.
@@ -340,7 +400,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}")
@@ -357,7 +417,7 @@ instance's greatest bottleneck.
if not maybe_dl:
return None
await self._download_link(element, link_template_maybe, maybe_dl)
return self._download_link(element, link_template_maybe, maybe_dl)
@anoncritical
@_iorepeat(3, "resolving link")
@@ -449,7 +509,7 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
# Copy old mapping as it is likely still relevant
if self.prev_report:
self.report.add_custom_value(
@@ -475,7 +535,7 @@ instance's greatest bottleneck.
return None
await self._download_video(element_path, element, maybe_dl)
return self._download_video(element_path, element, maybe_dl)
def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]:
if not self.prev_report:
@@ -557,11 +617,11 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl:
return None
await self._download_file(element, maybe_dl)
return self._download_file(element, maybe_dl)
@anoncritical
@_iorepeat(3, "downloading file")
@@ -604,11 +664,11 @@ instance's greatest bottleneck.
self,
element: IliasPageElement,
element_path: PurePath,
) -> None:
) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None
await self._crawl_forum(element, maybe_cl)
return self._crawl_forum(element, maybe_cl)
@_iorepeat(3, "crawling forum")
@anoncritical


@@ -2,7 +2,7 @@ import os
import re
from dataclasses import dataclass
from pathlib import PurePath
from typing import List, Optional, Pattern, Set, Tuple, Union
from typing import Awaitable, List, Optional, Pattern, Set, Union
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag
@@ -64,62 +64,67 @@ class KitIpdCrawler(HttpCrawler):
self._file_regex = section.link_regex()
async def _run(self) -> None:
cl = await self.crawl(PurePath("."))
if not cl:
maybe_cl = await self.crawl(PurePath("."))
if not maybe_cl:
return
async with cl:
tasks: List[Awaitable[None]] = []
async with maybe_cl:
for item in await self._fetch_items():
if isinstance(item, KitIpdFolder):
await self._crawl_folder(item)
tasks.append(self._crawl_folder(item))
else:
# Orphan files are placed in the root folder
await self._download_file(PurePath("."), item)
tasks.append(self._download_file(PurePath("."), item))
await self.gather(tasks)
async def _crawl_folder(self, folder: KitIpdFolder) -> None:
path = PurePath(folder.name)
if not await self.crawl(path):
return
for file in folder.files:
await self._download_file(path, file)
tasks = [self._download_file(path, file) for file in folder.files]
await self.gather(tasks)
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
element_path = parent / file.name
dl = await self.download(element_path)
if not dl:
maybe_dl = await self.download(element_path)
if not maybe_dl:
return
async with dl as (bar, sink):
async with maybe_dl as (bar, sink):
await self._stream_from_url(file.url, sink, bar)
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
page, url = await self._get_page()
page = await self.get_page()
elements: List[Tag] = self._find_file_links(page)
items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
for element in elements:
folder_label = self._find_folder_label(element)
if folder_label:
folder = self._extract_folder(folder_label, url)
folder = self._extract_folder(folder_label)
if folder not in items:
items.add(folder)
folder.explain()
else:
file = self._extract_file(element, url)
file = self._extract_file(element)
items.add(file)
log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
log.explain("Attributing it to root folder")
return items
def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder:
files: List[KitIpdFile] = []
name = folder_tag.getText().strip()
container: Tag = folder_tag.findNextSibling(name="table")
for link in self._find_file_links(container):
files.append(self._extract_file(link, url))
files.append(self._extract_file(link))
return KitIpdFolder(name, files)
@@ -130,16 +135,16 @@ class KitIpdCrawler(HttpCrawler):
return None
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
url = self._abs_url_from_link(url, link)
def _extract_file(self, link: Tag) -> KitIpdFile:
url = self._abs_url_from_link(link)
name = os.path.basename(url)
return KitIpdFile(name, url)
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
return tag.findAll(name="a", attrs={"href": self._file_regex})
def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
return urljoin(url, link_tag.get("href"))
def _abs_url_from_link(self, link_tag: Tag) -> str:
return urljoin(self._url, link_tag.get("href"))
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
async with self.session.get(url, allow_redirects=False) as resp:
@@ -154,12 +159,12 @@ class KitIpdCrawler(HttpCrawler):
sink.done()
async def _get_page(self) -> Tuple[BeautifulSoup, str]:
response = self.session.get(self._url)
# The web page for Algorithmen für Routenplanung contains some
# weird comments that beautifulsoup doesn't parse correctly. This
# hack enables those pages to be crawled, and should hopefully not
# cause issues on other pages.
content = re.sub(r"<!--.*?-->", "", response.text)
return soupify(content.encode("utf-8")), str(request.url)
async def get_page(self) -> BeautifulSoup:
async with self.session.get(self._url) as request:
# The web page for Algorithmen für Routenplanung contains some
# weird comments that beautifulsoup doesn't parse correctly. This
# hack enables those pages to be crawled, and should hopefully not
# cause issues on other pages.
content = (await request.read()).decode("utf-8")
content = re.sub(r"<!--.*?-->", "", content)
return soupify(content.encode("utf-8"))


@@ -71,6 +71,8 @@ class LocalCrawler(Crawler):
if not cl:
return
tasks = []
async with cl:
await asyncio.sleep(random.uniform(
0.5 * self._crawl_delay,
@@ -79,7 +81,9 @@ class LocalCrawler(Crawler):
for child in path.iterdir():
pure_child = cl.path / child.name
await self._crawl_path(child, pure_child)
tasks.append(self._crawl_path(child, pure_child))
await self.gather(tasks)
async def _crawl_file(self, path: Path, pure: PurePath) -> None:
stat = path.stat()
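
The recurring change in the crawlers above (local, KIT IPD and ILIAS) is the same concurrency pattern: collect the per-child coroutines first and run them together instead of awaiting each one in turn. A standalone sketch of that pattern, assuming Crawler.gather is essentially a wrapper around asyncio.gather:

import asyncio

async def handle_child(i: int) -> None:
    # Stand-in for _crawl_path / _download_file on one child element
    await asyncio.sleep(0.01 * i)

async def crawl_folder() -> None:
    # Before: `for i in range(5): await handle_child(i)` ran strictly sequentially.
    # After: the coroutines are collected and executed concurrently.
    tasks = [handle_child(i) for i in range(5)]
    await asyncio.gather(*tasks)

asyncio.run(crawl_folder())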

PFERD/limiter.py (new file, 97 lines)

@@ -0,0 +1,97 @@
import asyncio
import time
from contextlib import asynccontextmanager
from dataclasses import dataclass
from typing import AsyncIterator, Optional
@dataclass
class Slot:
active: bool = False
last_left: Optional[float] = None
class Limiter:
def __init__(
self,
task_limit: int,
download_limit: int,
task_delay: float
):
if task_limit <= 0:
raise ValueError("task limit must be at least 1")
if download_limit <= 0:
raise ValueError("download limit must be at least 1")
if download_limit > task_limit:
raise ValueError("download limit can't be greater than task limit")
if task_delay < 0:
raise ValueError("Task delay must not be negative")
self._slots = [Slot() for _ in range(task_limit)]
self._downloads = download_limit
self._delay = task_delay
self._condition = asyncio.Condition()
def _acquire_slot(self) -> Optional[Slot]:
for slot in self._slots:
if not slot.active:
slot.active = True
return slot
return None
async def _wait_for_slot_delay(self, slot: Slot) -> None:
if slot.last_left is not None:
delay = slot.last_left + self._delay - time.time()
if delay > 0:
await asyncio.sleep(delay)
def _release_slot(self, slot: Slot) -> None:
slot.last_left = time.time()
slot.active = False
@asynccontextmanager
async def limit_crawl(self) -> AsyncIterator[None]:
slot: Slot
async with self._condition:
while True:
if found_slot := self._acquire_slot():
slot = found_slot
break
await self._condition.wait()
await self._wait_for_slot_delay(slot)
try:
yield
finally:
async with self._condition:
self._release_slot(slot)
self._condition.notify_all()
@asynccontextmanager
async def limit_download(self) -> AsyncIterator[None]:
slot: Slot
async with self._condition:
while True:
if self._downloads <= 0:
await self._condition.wait()
continue
if found_slot := self._acquire_slot():
slot = found_slot
self._downloads -= 1
break
await self._condition.wait()
await self._wait_for_slot_delay(slot)
try:
yield
finally:
async with self._condition:
self._release_slot(slot)
self._downloads += 1
self._condition.notify_all()
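
A minimal usage sketch of the new Limiter (not part of the diff; the limits and delay are arbitrary). Crawl and download sections are wrapped in the two async context managers, which cap concurrency and enforce the per-slot delay:

import asyncio
from PFERD.limiter import Limiter

async def demo() -> None:
    limiter = Limiter(task_limit=2, download_limit=1, task_delay=0.5)

    async def crawl(i: int) -> None:
        # At most task_limit of these hold a slot at once; reusing a slot that
        # was released less than task_delay seconds ago waits out the delay first.
        async with limiter.limit_crawl():
            await asyncio.sleep(0.1)

    async def download(i: int) -> None:
        # Downloads additionally respect the separate download_limit.
        async with limiter.limit_download():
            await asyncio.sleep(0.1)

    await asyncio.gather(*(crawl(i) for i in range(4)), download(0), download(1))

asyncio.run(demo())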

PFERD/update.py (new file, 53 lines)

@@ -0,0 +1,53 @@
from dataclasses import dataclass
import ssl
from typing import Optional
import aiohttp
import certifi
from .version import NAME, VERSION
from .logging import log
@dataclass
class PferdUpdate:
release_url: str
version: str
def _build_session() -> aiohttp.ClientSession:
return aiohttp.ClientSession(
headers={"User-Agent": f"{NAME}/{VERSION}"},
connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())),
timeout=aiohttp.ClientTimeout(
total=15 * 60,
connect=10,
sock_connect=10,
sock_read=10,
)
)
async def check_for_updates() -> None:
if new_version := await get_newer_version():
log.warn(
f"{NAME} version out of date. "
+ f"You are running version {VERSION!r} but {new_version.version!r} was found on GitHub."
)
log.warn_contd(f"You can download it on GitHub: {new_version.release_url}")
else:
log.explain("No update found")
async def get_newer_version() -> Optional[PferdUpdate]:
async with _build_session() as session:
async with session.get(
"https://api.github.com/repos/Garmelon/Pferd/releases/latest",
headers={"Accept": "application/vnd.github+json"}
) as response:
release_information = await response.json()
tag_name: str = release_information["tag_name"]
tag_name = tag_name.removeprefix("v")
if VERSION == tag_name:
return None
return PferdUpdate(release_url=release_information["html_url"], version=tag_name)
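
A short sketch of how the new module is driven at startup (this mirrors the __main__.py and CLI hunks above; the flag value is hard-coded here for illustration):

import asyncio
from PFERD.update import check_for_updates, get_newer_version

async def startup(skip_update_check: bool) -> None:
    if skip_update_check:
        return
    # Warns via the log if GitHub's latest release tag differs from the running VERSION.
    await check_for_updates()

asyncio.run(startup(skip_update_check=False))

# To inspect the result directly instead of logging it:
update = asyncio.run(get_newer_version())
if update is not None:
    print(update.version, update.release_url)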


@@ -92,32 +92,17 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str:
def str_path(path: PurePath) -> str:
"""
Turn a path into a string, in a platform-independent way.
This function always uses "/" as path separator, even on Windows.
"""
if not path.parts:
return "."
return "/".join(path.parts)
def fmt_path(path: PurePath) -> str:
"""
Turn a path into a delimited string.
This is useful if file or directory names contain weird characters like
newlines, leading/trailing whitespace or unprintable characters. This way,
they are escaped and visible to the user.
"""
return repr(str_path(path))
def fmt_real_path(path: Path) -> str:
"""
Like fmt_path, but resolves the path before converting it to a string.
"""
return fmt_path(path.absolute())
return repr(str(path.absolute()))
class ReusableAsyncContextManager(ABC, Generic[T]):


@@ -1,2 +1,2 @@
NAME = "PFERD"
VERSION = "3.4.3"
VERSION = "3.4.1"


@@ -30,10 +30,7 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
Unofficial packages are available for:
- [AUR](https://aur.archlinux.org/packages/pferd)
- [brew](https://formulae.brew.sh/formula/pferd)
- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
- [PyPi](https://pypi.org/project/pferd)
See also PFERD's [repology page](https://repology.org/project/pferd/versions).


@@ -14,4 +14,4 @@ pip install --editable .
# Installing tools and type hints
pip install --upgrade mypy flake8 autopep8 isort pyinstaller
mypy PFERD --install-types --non-interactive
pip install --upgrade types-chardet types-certifi


@@ -11,7 +11,6 @@ install_requires =
rich>=11.0.0
keyring>=23.5.0
certifi>=2021.10.8
requests>=2.28.1
[options.entry_points]
console_scripts =