Compare commits

...

30 Commits

Author SHA1 Message Date
bf27f4a686 TODO 2023-04-19 10:13:36 +02:00
5adfdfbd2b Switch http_crawler to requests 2023-04-19 10:12:48 +02:00
5c3942a13d Fix flake8 error 2023-04-19 10:12:48 +02:00
5c9209b12e Document path formatting functions 2023-04-19 10:12:48 +02:00
50c7778d38 Use mypy to install library stub packages 2023-04-19 10:12:48 +02:00
354a22d1e3 Add vscode settings 2023-04-19 10:12:48 +02:00
6f87c5c774 Make ipd crawler synchronous 2023-04-19 10:12:48 +02:00
1ca10571f0 Remove limiter 2023-04-19 10:12:48 +02:00
10e1a5e871 De-Async ilias crawler 2023-04-19 10:12:48 +02:00
a2ffce4702 Make local crawler synchronous 2023-04-19 10:12:48 +02:00
0294ceb7d5 Update github action versions 2023-03-22 00:10:54 +01:00
6f30c6583d Fix crawling of cards without descriptions 2023-03-21 23:52:33 +01:00
467fc526e8 Fix crawling of file/video cards 2023-03-21 23:52:24 +01:00
722d2eb393 Fix crawling of courses with preselected timeline tab 2023-03-21 23:36:47 +01:00
6d44aac278 Bump version to 3.4.3 2022-11-29 18:22:19 +01:00
55a2de6b88 Fix crawling English opencast 2022-11-29 18:13:56 +01:00
c0d6d8b229 Use url after redirect for relative links 2022-11-21 18:10:45 +01:00
635caa765d Fix typo (Thanks, burg113) 2022-11-15 17:17:57 +01:00
e69b55b349 Add more unofficial package managers (#66) 2022-11-04 12:18:26 +01:00
07200bbde5 Document ilias web crawler's forums option 2022-10-31 14:12:27 +01:00
c020cccc64 Include found paths in "second path found" warning 2022-10-29 14:08:29 +02:00
259cfc20cc Bump version to 3.4.2 2022-10-26 18:26:17 +02:00
37b51a66d8 Update changelog 2022-10-26 18:22:37 +02:00
f47d2f11d8 Append trailing slash to kit-ipd links to ensure urljoin works as expected 2022-10-25 20:28:22 +02:00
1b6be6bd79 Handle content pages in cards 2022-10-24 18:37:26 +02:00
e1430e6298 Handle (and ignore) surveys 2022-10-24 18:37:26 +02:00
5fdd40204b Unwrap future meetings when ILIAS hides them behind a pagination 2022-10-24 14:33:58 +02:00
fb4631ba18 Fix ilias background login 2022-10-24 13:13:36 +02:00
d72fc2760b Handle empty forums 2022-10-24 13:12:17 +02:00
4a51aaa4f5 Fix forum crawling crashing for empty threads 2022-10-19 22:59:33 +02:00
17 changed files with 289 additions and 340 deletions

View File

@ -17,9 +17,9 @@ jobs:
python: ["3.9"] python: ["3.9"]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v3
- uses: actions/setup-python@v2 - uses: actions/setup-python@v4
with: with:
python-version: ${{ matrix.python }} python-version: ${{ matrix.python }}
@ -45,7 +45,7 @@ jobs:
run: mv dist/pferd* dist/pferd-${{ matrix.os }} run: mv dist/pferd* dist/pferd-${{ matrix.os }}
- name: Upload binary - name: Upload binary
uses: actions/upload-artifact@v2 uses: actions/upload-artifact@v3
with: with:
name: Binaries name: Binaries
path: dist/pferd-${{ matrix.os }} path: dist/pferd-${{ matrix.os }}
@ -57,7 +57,7 @@ jobs:
steps: steps:
- name: Download binaries - name: Download binaries
uses: actions/download-artifact@v2 uses: actions/download-artifact@v3
with: with:
name: Binaries name: Binaries

.gitignore vendored
View File

@ -2,7 +2,6 @@
/.venv/ /.venv/
/PFERD.egg-info/ /PFERD.egg-info/
__pycache__/ __pycache__/
/.vscode/
# pyinstaller # pyinstaller
/pferd.spec /pferd.spec

.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,8 @@
{
"files.insertFinalNewline": true,
"files.trimFinalNewlines": true,
"python.formatting.provider": "autopep8",
"python.linting.enabled": true,
"python.linting.flake8Enabled": true,
"python.linting.mypyEnabled": true,
}

View File

@ -22,6 +22,36 @@ ambiguous situations.
## Unreleased ## Unreleased
### Fixed
- Crawling of courses with the timeline view as the default tab
- Crawling of file and custom opencast cards
- Crawling of button cards without descriptions
## 3.4.3 - 2022-11-29
### Added
- Missing documentation for `forums` option
### Changed
- Clear up error message shown when multiple paths are found to an element
### Fixed
- IPD crawler unnecessarily appending trailing slashes
- Crawling opencast when ILIAS is set to English
## 3.4.2 - 2022-10-26
### Added
- Recognize and crawl content pages in cards
- Recognize and ignore surveys
### Fixed
- Forum crawling crashing when a thread has no messages at all
- Forum crawling crashing when a forum has no threads at all
- Ilias login failing in some cases
- Crawling of paginated future meetings
- IPD crawler handling of URLs without trailing slash
## 3.4.1 - 2022-08-17 ## 3.4.1 - 2022-08-17
### Added ### Added

View File

@ -181,6 +181,7 @@ script once per day should be fine.
redirect to the actual URL. Set to a negative value to disable the automatic redirect to the actual URL. Set to a negative value to disable the automatic
redirect. (Default: `-1`) redirect. (Default: `-1`)
- `videos`: Whether to download videos. (Default: `no`) - `videos`: Whether to download videos. (Default: `no`)
- `forums`: Whether to download forum threads. (Default: `no`)
- `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default: - `http_timeout`: The timeout (in seconds) for all HTTP requests. (Default:
`20.0`) `20.0`)
@ -289,7 +290,7 @@ path matches `SOURCE`, it is renamed to `TARGET`.
Example: `foo/bar --> baz` Example: `foo/bar --> baz`
- Doesn't match `foo`, `a/foo/bar` or `foo/baz` - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
- Converts `foo/bar` into `baz` - Converts `foo/bar` into `baz`
- Converts `foo/bar/wargl` into `bar/wargl` - Converts `foo/bar/wargl` into `baz/wargl`
Example: `foo/bar --> !` Example: `foo/bar --> !`
- Doesn't match `foo`, `a/foo/bar` or `foo/baz` - Doesn't match `foo`, `a/foo/bar` or `foo/baz`
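The corrected example above (`foo/bar/wargl` becomes `baz/wargl`, not `bar/wargl`) is plain prefix replacement on path components. Below is a minimal sketch of that semantics only; it is not PFERD's actual Transformer, which supports a richer rule language.

```python
from pathlib import PurePath
from typing import Optional

def prefix_rename(path: PurePath, source: PurePath, target: PurePath) -> Optional[PurePath]:
    """If `path` equals `source` or lies below it, replace the `source` prefix with `target`."""
    if path.parts[:len(source.parts)] != source.parts:
        return None  # rule does not match, e.g. "foo" or "a/foo/bar"
    return PurePath(target, *path.parts[len(source.parts):])

assert prefix_rename(PurePath("foo/bar"), PurePath("foo/bar"), PurePath("baz")) == PurePath("baz")
assert prefix_rename(PurePath("foo/bar/wargl"), PurePath("foo/bar"), PurePath("baz")) == PurePath("baz/wargl")
assert prefix_rename(PurePath("foo/baz"), PurePath("foo/bar"), PurePath("baz")) is None
```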

View File

@ -9,7 +9,6 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Ty
from ..auth import Authenticator from ..auth import Authenticator
from ..config import Config, Section from ..config import Config, Section
from ..deduplicator import Deduplicator from ..deduplicator import Deduplicator
from ..limiter import Limiter
from ..logging import ProgressBar, log from ..logging import ProgressBar, log
from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload from ..output_dir import FileSink, FileSinkToken, OnConflict, OutputDirectory, OutputDirError, Redownload
from ..report import MarkConflictError, MarkDuplicateError, Report from ..report import MarkConflictError, MarkDuplicateError, Report
@ -98,10 +97,9 @@ def anoncritical(f: AWrapped) -> AWrapped:
class CrawlToken(ReusableAsyncContextManager[ProgressBar]): class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
def __init__(self, limiter: Limiter, path: PurePath): def __init__(self, path: PurePath):
super().__init__() super().__init__()
self._limiter = limiter
self._path = path self._path = path
@property @property
@ -110,17 +108,15 @@ class CrawlToken(ReusableAsyncContextManager[ProgressBar]):
async def _on_aenter(self) -> ProgressBar: async def _on_aenter(self) -> ProgressBar:
self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path))) self._stack.callback(lambda: log.status("[bold cyan]", "Crawled", fmt_path(self._path)))
await self._stack.enter_async_context(self._limiter.limit_crawl())
bar = self._stack.enter_context(log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path))) bar = self._stack.enter_context(log.crawl_bar("[bold bright_cyan]", "Crawling", fmt_path(self._path)))
return bar return bar
class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]): class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
def __init__(self, limiter: Limiter, fs_token: FileSinkToken, path: PurePath): def __init__(self, fs_token: FileSinkToken, path: PurePath):
super().__init__() super().__init__()
self._limiter = limiter
self._fs_token = fs_token self._fs_token = fs_token
self._path = path self._path = path
@ -129,7 +125,6 @@ class DownloadToken(ReusableAsyncContextManager[Tuple[ProgressBar, FileSink]]):
return self._path return self._path
async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]: async def _on_aenter(self) -> Tuple[ProgressBar, FileSink]:
await self._stack.enter_async_context(self._limiter.limit_download())
sink = await self._stack.enter_async_context(self._fs_token) sink = await self._stack.enter_async_context(self._fs_token)
# The "Downloaded ..." message is printed in the output dir, not here # The "Downloaded ..." message is printed in the output dir, not here
bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading", bar = self._stack.enter_context(log.download_bar("[bold bright_cyan]", "Downloading",
@ -235,12 +230,6 @@ class Crawler(ABC):
self.name = name self.name = name
self.error_free = True self.error_free = True
self._limiter = Limiter(
task_limit=section.tasks(),
download_limit=section.downloads(),
task_delay=section.task_delay(),
)
self._deduplicator = Deduplicator(section.windows_paths()) self._deduplicator = Deduplicator(section.windows_paths())
self._transformer = Transformer(section.transform()) self._transformer = Transformer(section.transform())
@ -288,7 +277,7 @@ class Crawler(ABC):
return None return None
log.explain("Answer: Yes") log.explain("Answer: Yes")
return CrawlToken(self._limiter, path) return CrawlToken(path)
async def download( async def download(
self, self,
@ -313,7 +302,7 @@ class Crawler(ABC):
return None return None
log.explain("Answer: Yes") log.explain("Answer: Yes")
return DownloadToken(self._limiter, fs_token, path) return DownloadToken(fs_token, path)
async def _cleanup(self) -> None: async def _cleanup(self) -> None:
log.explain_topic("Decision: Clean up files") log.explain_topic("Decision: Clean up files")

View File

@ -1,12 +1,9 @@
import asyncio import asyncio
import http.cookies from http.cookiejar import LWPCookieJar
import ssl
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import Any, Dict, List, Optional from typing import Dict, List, Optional
import aiohttp import requests
import certifi
from aiohttp.client import ClientTimeout
from ..auth import Authenticator from ..auth import Authenticator
from ..config import Config from ..config import Config
@ -35,9 +32,9 @@ class HttpCrawler(Crawler):
self._authentication_id = 0 self._authentication_id = 0
self._authentication_lock = asyncio.Lock() self._authentication_lock = asyncio.Lock()
self._request_count = 0 self._http_timeout = section.http_timeout() # TODO Use or remove
self._http_timeout = section.http_timeout()
self._cookie_jar = LWPCookieJar()
self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE) self._cookie_jar_path = self._output_dir.resolve(self.COOKIE_FILE)
self._shared_cookie_jar_paths: Optional[List[Path]] = None self._shared_cookie_jar_paths: Optional[List[Path]] = None
self._shared_auth = shared_auth self._shared_auth = shared_auth
@ -57,7 +54,6 @@ class HttpCrawler(Crawler):
# This should reduce the amount of requests we make: If an authentication is in progress # This should reduce the amount of requests we make: If an authentication is in progress
# all future requests wait for authentication to complete. # all future requests wait for authentication to complete.
async with self._authentication_lock: async with self._authentication_lock:
self._request_count += 1
return self._authentication_id return self._authentication_id
async def authenticate(self, caller_auth_id: int) -> None: async def authenticate(self, caller_auth_id: int) -> None:
@ -106,32 +102,13 @@ class HttpCrawler(Crawler):
self._shared_cookie_jar_paths.append(self._cookie_jar_path) self._shared_cookie_jar_paths.append(self._cookie_jar_path)
def _load_cookies_from_file(self, path: Path) -> None:
jar: Any = http.cookies.SimpleCookie()
with open(path, encoding="utf-8") as f:
for i, line in enumerate(f):
# Names of headers are case insensitive
if line[:11].lower() == "set-cookie:":
jar.load(line[11:])
else:
log.explain(f"Line {i} doesn't start with 'Set-Cookie:', ignoring it")
self._cookie_jar.update_cookies(jar)
def _save_cookies_to_file(self, path: Path) -> None:
jar: Any = http.cookies.SimpleCookie()
for morsel in self._cookie_jar:
jar[morsel.key] = morsel
with open(path, "w", encoding="utf-8") as f:
f.write(jar.output(sep="\n"))
f.write("\n") # A trailing newline is just common courtesy
def _load_cookies(self) -> None: def _load_cookies(self) -> None:
log.explain_topic("Loading cookies") log.explain_topic("Loading cookies")
cookie_jar_path: Optional[Path] = None cookie_jar_path: Optional[Path] = None
if self._shared_cookie_jar_paths is None: if self._shared_cookie_jar_paths is None:
log.explain("Not sharing any cookies") log.explain("Not sharing cookies")
cookie_jar_path = self._cookie_jar_path cookie_jar_path = self._cookie_jar_path
else: else:
log.explain("Sharing cookies") log.explain("Sharing cookies")
@ -154,46 +131,38 @@ class HttpCrawler(Crawler):
log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}") log.explain(f"Loading cookies from {fmt_real_path(cookie_jar_path)}")
try: try:
self._load_cookies_from_file(cookie_jar_path) self._cookie_jar.load(filename=str(cookie_jar_path))
except Exception as e: except Exception as e:
log.explain("Failed to load cookies") log.explain(f"Failed to load cookies: {e}")
log.explain(str(e)) log.explain("Proceeding without cookies")
def _save_cookies(self) -> None: def _save_cookies(self) -> None:
log.explain_topic("Saving cookies") log.explain_topic("Saving cookies")
try: try:
log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}") log.explain(f"Saving cookies to {fmt_real_path(self._cookie_jar_path)}")
self._save_cookies_to_file(self._cookie_jar_path) self._cookie_jar.save(filename=str(self._cookie_jar_path))
except Exception as e: except Exception as e:
log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}") log.warn(f"Failed to save cookies: {e}")
log.warn(str(e))
async def run(self) -> None: async def run(self) -> None:
self._request_count = 0 self._request_count = 0
self._cookie_jar = aiohttp.CookieJar()
self._load_cookies() self._load_cookies()
async with aiohttp.ClientSession( self.session = requests.Session()
headers={"User-Agent": f"{NAME}/{VERSION}"}, self.session.headers["User-Agent"] = f"{NAME}/{VERSION}"
cookie_jar=self._cookie_jar,
connector=aiohttp.TCPConnector(ssl=ssl.create_default_context(cafile=certifi.where())), # From the request docs: "All requests code should work out of the box
timeout=ClientTimeout( # with externally provided instances of CookieJar, e.g. LWPCookieJar and
# 30 minutes. No download in the history of downloads was longer than 30 minutes. # FileCookieJar."
# This is enough to transfer a 600 MB file over a 3 Mib/s connection. # https://requests.readthedocs.io/en/latest/api/#requests.cookies.RequestsCookieJar
# Allowing an arbitrary value could be annoying for overnight batch jobs self.session.cookies = self._cookie_jar # type: ignore
total=15 * 60,
connect=self._http_timeout, with self.session:
sock_connect=self._http_timeout,
sock_read=self._http_timeout,
)
) as session:
self.session = session
try: try:
await super().run() await super().run()
finally: finally:
del self.session del self.session
log.explain_topic(f"Total amount of HTTP requests: {self._request_count}")
# They are saved in authenticate, but a final save won't hurt # They are saved in authenticate, but a final save won't hurt
self._save_cookies() self._save_cookies()
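The new code above keeps cookie persistence by handing requests an LWPCookieJar and using the jar's own load/save methods. A standalone sketch of the same pattern follows; the file name and URL are placeholders, and PFERD's logging and error handling are simplified to prints.

```python
from http.cookiejar import LWPCookieJar

import requests

COOKIE_FILE = "cookies.txt"  # placeholder path

jar = LWPCookieJar()
try:
    jar.load(filename=COOKIE_FILE)  # a missing file on the first run is fine
except Exception as e:
    print(f"Failed to load cookies: {e}")

with requests.Session() as session:
    session.headers["User-Agent"] = "PFERD/3.4.3"
    # requests works with externally provided cookielib jars such as LWPCookieJar
    session.cookies = jar  # type: ignore[assignment]
    response = session.get("https://example.com")  # placeholder URL
    response.raise_for_status()

try:
    jar.save(filename=COOKIE_FILE)
except Exception as e:
    print(f"Failed to save cookies: {e}")
```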

View File

@ -24,6 +24,7 @@ class IliasElementType(Enum):
LINK = "link" LINK = "link"
BOOKING = "booking" BOOKING = "booking"
MEETING = "meeting" MEETING = "meeting"
SURVEY = "survey"
VIDEO = "video" VIDEO = "video"
VIDEO_PLAYER = "video_player" VIDEO_PLAYER = "video_player"
VIDEO_FOLDER = "video_folder" VIDEO_FOLDER = "video_folder"
@ -59,6 +60,7 @@ class IliasPageElement:
class IliasDownloadForumData: class IliasDownloadForumData:
url: str url: str
form_data: Dict[str, Union[str, List[str]]] form_data: Dict[str, Union[str, List[str]]]
empty: bool
@dataclass @dataclass
@ -130,24 +132,34 @@ class IliasPage:
return None return None
post_url = self._abs_url_from_relative(form["action"]) post_url = self._abs_url_from_relative(form["action"])
form_data: Dict[str, Union[str, List[str]]] = { thread_ids = [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})]
"thread_ids[]": [f["value"] for f in form.find_all(attrs={"name": "thread_ids[]"})],
form_data: Dict[str, Union[str, List[str]]] = {
"thread_ids[]": thread_ids,
"selected_cmd2": "html", "selected_cmd2": "html",
"select_cmd2": "Ausführen", "select_cmd2": "Ausführen",
"selected_cmd": "", "selected_cmd": "",
} }
return IliasDownloadForumData(post_url, form_data) return IliasDownloadForumData(url=post_url, form_data=form_data, empty=len(thread_ids) == 0)
def get_next_stage_element(self) -> Optional[IliasPageElement]: def get_next_stage_element(self) -> Optional[IliasPageElement]:
if self._is_forum_page(): if self._is_forum_page():
if "trows=800" in self._page_url: if "trows=800" in self._page_url:
return None return None
log.explain("Requesting *all* forum threads")
return self._get_show_max_forum_entries_per_page_url() return self._get_show_max_forum_entries_per_page_url()
if self._is_ilias_opencast_embedding(): if self._is_ilias_opencast_embedding():
log.explain("Unwrapping opencast embedding")
return self.get_child_elements()[0] return self.get_child_elements()[0]
if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED: if self._page_type == IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED:
log.explain("Unwrapping video pagination")
return self._find_video_entries_paginated()[0] return self._find_video_entries_paginated()[0]
if self._contains_collapsed_future_meetings():
log.explain("Requesting *all* future meetings")
return self._uncollapse_future_meetings_url()
if not self._is_content_tab_selected():
return self._select_content_page_url()
return None return None
def _is_forum_page(self) -> bool: def _is_forum_page(self) -> bool:
@ -200,6 +212,37 @@ class IliasPage:
return False return False
return "target=copa_" in link.get("value") return "target=copa_" in link.get("value")
def _contains_collapsed_future_meetings(self) -> bool:
return self._uncollapse_future_meetings_url() is not None
def _uncollapse_future_meetings_url(self) -> Optional[IliasPageElement]:
element = self._soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x})
if not element:
return None
link = self._abs_url_from_link(element)
return IliasPageElement(IliasElementType.FOLDER, link, "show all meetings")
def _is_content_tab_selected(self) -> bool:
return self._select_content_page_url() is None
def _select_content_page_url(self) -> Optional[IliasPageElement]:
tab = self._soup.find(
id="tab_view_content",
attrs={"class": lambda x: x is not None and "active" not in x}
)
# Already selected (or not found)
if not tab:
return None
link = tab.find("a")
if link:
link = self._abs_url_from_link(link)
return IliasPageElement(IliasElementType.FOLDER, link, "select content page")
_unexpected_html_warning()
log.warn_contd(f"Could not find content tab URL on {self._page_url!r}.")
log.warn_contd("PFERD might not find content on the course's main page.")
return None
def _player_to_video(self) -> List[IliasPageElement]: def _player_to_video(self) -> List[IliasPageElement]:
# Fetch the actual video page. This is a small wrapper page initializing a javscript # Fetch the actual video page. This is a small wrapper page initializing a javscript
# player. Sadly we can not execute that JS. The actual video stream url is nowhere # player. Sadly we can not execute that JS. The actual video stream url is nowhere
@ -346,7 +389,7 @@ class IliasPage:
""" """
# Video start links are marked with an "Abspielen" link # Video start links are marked with an "Abspielen" link
video_links: List[Tag] = self._soup.findAll( video_links: List[Tag] = self._soup.findAll(
name="a", text=re.compile(r"\s*Abspielen\s*") name="a", text=re.compile(r"\s*(Abspielen|Play)\s*")
) )
results: List[IliasPageElement] = [] results: List[IliasPageElement] = []
@ -665,7 +708,11 @@ class IliasPage:
"div", "div",
attrs={"class": lambda x: x and "caption" in x}, attrs={"class": lambda x: x and "caption" in x},
) )
description = caption_parent.find_next_sibling("div").getText().strip() caption_container = caption_parent.find_next_sibling("div")
if caption_container:
description = caption_container.getText().strip()
else:
description = None
if not type: if not type:
_unexpected_html_warning() _unexpected_html_warning()
@ -695,7 +742,7 @@ class IliasPage:
icon: Tag = card_root.select_one(".il-card-repository-head .icon") icon: Tag = card_root.select_one(".il-card-repository-head .icon")
if "opencast" in icon["class"]: if "opencast" in icon["class"] or "xoct" in icon["class"]:
return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED return IliasElementType.VIDEO_FOLDER_MAYBE_PAGINATED
if "exc" in icon["class"]: if "exc" in icon["class"]:
return IliasElementType.EXERCISE return IliasElementType.EXERCISE
@ -711,6 +758,12 @@ class IliasPage:
return IliasElementType.TEST return IliasElementType.TEST
if "fold" in icon["class"]: if "fold" in icon["class"]:
return IliasElementType.FOLDER return IliasElementType.FOLDER
if "copa" in icon["class"]:
return IliasElementType.FOLDER
if "svy" in icon["class"]:
return IliasElementType.SURVEY
if "file" in icon["class"]:
return IliasElementType.FILE
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Could not extract type from {icon} for card title {card_title}") log.warn_contd(f"Could not extract type from {icon} for card title {card_title}")
@ -790,6 +843,10 @@ class IliasPage:
if img_tag is None: if img_tag is None:
img_tag = found_parent.select_one("img.icon") img_tag = found_parent.select_one("img.icon")
if img_tag is None and found_parent.find("a", attrs={"href": lambda x: x and "crs_next_sess=" in x}):
log.explain("Found session expansion button, skipping it as it has no content")
return None
if img_tag is None: if img_tag is None:
_unexpected_html_warning() _unexpected_html_warning()
log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}") log.warn_contd(f"Tried to figure out element type, but did not find an image for {url}")
@ -937,6 +994,13 @@ def parse_ilias_forum_export(forum_export: BeautifulSoup) -> List[IliasForumThre
for p in forum_export.select("body > p"): for p in forum_export.select("body > p"):
title_tag = p title_tag = p
content_tag = p.find_next_sibling("ul") content_tag = p.find_next_sibling("ul")
if not content_tag:
# ILIAS allows users to delete the initial post while keeping the thread open
# This produces empty threads without *any* content.
# I am not sure why you would want this, but ILIAS makes it easy to do.
continue
title = p.find("b").text title = p.find("b").text
if ":" in title: if ":" in title:
title = title[title.find(":") + 1:] title = title[title.find(":") + 1:]
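The helpers added above locate ILIAS controls purely by markup: the "show all future meetings" link via its `crs_next_sess=1` query parameter, and the content tab via its `tab_view_content` id plus the absence of an `active` class. Here is a self-contained sketch of that matching on an invented HTML fragment; the markup below is illustrative, not real ILIAS output.

```python
from bs4 import BeautifulSoup

html = """
<ul>
  <li id="tab_view_content" class="tab"><a href="/goto.php?target=copa_123">Content</a></li>
</ul>
<a href="/ilias.php?cmd=view&amp;crs_next_sess=1">Show all upcoming sessions</a>
"""
soup = BeautifulSoup(html, "html.parser")

# The "expand future meetings" link is recognized by its query parameter
expand = soup.find("a", attrs={"href": lambda x: x and "crs_next_sess=1" in x})
print(expand["href"])

# The content tab is only followed if it exists and is not already active
tab = soup.find(id="tab_view_content", attrs={"class": lambda x: x is not None and "active" not in x})
print(tab.find("a")["href"] if tab else "content tab already selected (or not present)")
```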

View File

@ -23,6 +23,12 @@ from .kit_ilias_html import (IliasElementType, IliasForumThread, IliasPage, Ilia
TargetType = Union[str, int] TargetType = Union[str, int]
_ILIAS_URL = "https://ilias.studium.kit.edu"
class KitShibbolethBackgroundLoginSuccessful():
pass
class KitIliasWebCrawlerSection(HttpCrawlerSection): class KitIliasWebCrawlerSection(HttpCrawlerSection):
def target(self) -> TargetType: def target(self) -> TargetType:
@ -36,7 +42,7 @@ class KitIliasWebCrawlerSection(HttpCrawlerSection):
if target == "desktop": if target == "desktop":
# Full personal desktop # Full personal desktop
return target return target
if target.startswith("https://ilias.studium.kit.edu"): if target.startswith(_ILIAS_URL):
# ILIAS URL # ILIAS URL
return target return target
@ -120,13 +126,6 @@ def _iorepeat(attempts: int, name: str, failure_is_error: bool = False) -> Calla
return decorator return decorator
def _wrap_io_in_warning(name: str) -> Callable[[AWrapped], AWrapped]:
"""
Wraps any I/O exception in a CrawlWarning.
"""
return _iorepeat(1, name)
# Crawler control flow: # Crawler control flow:
# #
# crawl_desktop -+ # crawl_desktop -+
@ -181,14 +180,14 @@ instance's greatest bottleneck.
section.tfa_auth(authenticators), section.tfa_auth(authenticators),
) )
self._base_url = "https://ilias.studium.kit.edu" self._base_url = _ILIAS_URL
self._target = section.target() self._target = section.target()
self._link_file_redirect_delay = section.link_redirect_delay() self._link_file_redirect_delay = section.link_redirect_delay()
self._links = section.links() self._links = section.links()
self._videos = section.videos() self._videos = section.videos()
self._forums = section.forums() self._forums = section.forums()
self._visited_urls: Set[str] = set() self._visited_urls: Dict[str, PurePath] = dict()
async def _run(self) -> None: async def _run(self) -> None:
if isinstance(self._target, int): if isinstance(self._target, int):
@ -220,105 +219,45 @@ instance's greatest bottleneck.
return return
cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608 cl = maybe_cl # Not mypy's fault, but explained here: https://github.com/python/mypy/issues/2608
elements: List[IliasPageElement] = [] def ensure_is_valid_course_id(parent: Optional[IliasPageElement], soup: BeautifulSoup) -> None:
# A list as variable redefinitions are not propagated to outer scopes if parent is None and expected_id is not None:
description: List[BeautifulSoup] = [] perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
@_iorepeat(3, "crawling url") await self._crawl_ilias_page(url, None, cl, ensure_is_valid_course_id)
async def gather_elements() -> None:
elements.clear()
async with cl:
soup = await self._get_page(url)
if expected_id is not None:
perma_link_element: Tag = soup.find(id="current_perma_link")
if not perma_link_element or "crs_" not in perma_link_element.get("value"):
raise CrawlError("Invalid course id? Didn't find anything looking like a course")
# Duplicated code, but the root page is special - we want to avoid fetching it twice!
log.explain_topic("Parsing root HTML page")
log.explain(f"URL: {url}")
page = IliasPage(soup, url, None)
elements.extend(page.get_child_elements())
if description_string := page.get_description():
description.append(description_string)
# Fill up our task list with the found elements
await gather_elements()
if description:
await self._download_description(PurePath("."), description[0])
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(PurePath("."), element):
tasks.append(asyncio.create_task(handle))
# And execute them
await self.gather(tasks)
async def _handle_ilias_page(
self,
url: str,
parent: IliasPageElement,
path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]:
maybe_cl = await self.crawl(path)
if not maybe_cl:
return None
return self._crawl_ilias_page(url, parent, maybe_cl)
@anoncritical @anoncritical
async def _crawl_ilias_page( async def _crawl_ilias_page(
self, self,
url: str, url: str,
parent: IliasPageElement, parent: Optional[IliasPageElement],
cl: CrawlToken, cl: CrawlToken,
next_stage_hook: Callable[[Optional[IliasPageElement], BeautifulSoup], None] = lambda a, b: None
) -> None: ) -> None:
elements: List[IliasPageElement] = [] async with cl:
# A list as variable redefinitions are not propagated to outer scopes next_stage_url: Optional[str] = url
description: List[BeautifulSoup] = [] current_parent = parent
@_iorepeat(3, "crawling folder") while next_stage_url:
async def gather_elements() -> None: soup = await self._get_page(next_stage_url)
elements.clear() log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
async with cl: log.explain(f"URL: {next_stage_url}")
next_stage_url: Optional[str] = url
current_parent = parent
while next_stage_url: next_stage_hook(current_parent, soup)
soup = await self._get_page(next_stage_url)
log.explain_topic(f"Parsing HTML page for {fmt_path(cl.path)}")
log.explain(f"URL: {next_stage_url}")
page = IliasPage(soup, next_stage_url, current_parent)
if next_element := page.get_next_stage_element():
current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
elements.extend(page.get_child_elements()) page = IliasPage(soup, next_stage_url, current_parent)
if description_string := page.get_description(): if next_element := page.get_next_stage_element():
description.append(description_string) current_parent = next_element
next_stage_url = next_element.url
else:
next_stage_url = None
# Fill up our task list with the found elements for element in sorted(page.get_child_elements(), key=lambda e: e.id()):
await gather_elements() await self._handle_ilias_element(cl.path, element)
if description: if description_string := page.get_description():
await self._download_description(cl.path, description[0]) await self._download_description(cl.path, description_string)
elements.sort(key=lambda e: e.id())
tasks: List[Awaitable[None]] = []
for element in elements:
if handle := await self._handle_ilias_element(cl.path, element):
tasks.append(asyncio.create_task(handle))
# And execute them
await self.gather(tasks)
# These decorators only apply *to this method* and *NOT* to the returned # These decorators only apply *to this method* and *NOT* to the returned
# awaitables! # awaitables!
@ -330,12 +269,14 @@ instance's greatest bottleneck.
self, self,
parent_path: PurePath, parent_path: PurePath,
element: IliasPageElement, element: IliasPageElement,
) -> Optional[Coroutine[Any, Any, None]]: ) -> None:
if element.url in self._visited_urls: if element.url in self._visited_urls:
raise CrawlWarning( raise CrawlWarning(
f"Found second path to element {element.name!r} at {element.url!r}. Aborting subpath" f"Found second path to element {element.name!r} at {element.url!r}. "
+ f"First path: {fmt_path(self._visited_urls[element.url])}. "
+ f"Second path: {fmt_path(parent_path)}."
) )
self._visited_urls.add(element.url) self._visited_urls[element.url] = parent_path
element_path = PurePath(parent_path, element.name) element_path = PurePath(parent_path, element.name)
@ -350,7 +291,7 @@ instance's greatest bottleneck.
return None return None
if element.type == IliasElementType.FILE: if element.type == IliasElementType.FILE:
return await self._handle_file(element, element_path) await self._handle_file(element, element_path)
elif element.type == IliasElementType.FORUM: elif element.type == IliasElementType.FORUM:
if not self._forums: if not self._forums:
log.status( log.status(
@ -360,22 +301,36 @@ instance's greatest bottleneck.
"[bright_black](enable with option 'forums')" "[bright_black](enable with option 'forums')"
) )
return None return None
return await self._handle_forum(element, element_path) await self._handle_forum(element, element_path)
elif element.type == IliasElementType.TEST: elif element.type == IliasElementType.TEST:
log.explain_topic(f"Decision: Crawl {fmt_path(element_path)}") log.status(
log.explain("Tests contain no relevant files") "[bold bright_black]",
log.explain("Answer: No") "Ignored",
fmt_path(element_path),
"[bright_black](tests contain no relevant data)"
)
return None
elif element.type == IliasElementType.SURVEY:
log.status(
"[bold bright_black]",
"Ignored",
fmt_path(element_path),
"[bright_black](surveys contain no relevant data)"
)
return None return None
elif element.type == IliasElementType.LINK: elif element.type == IliasElementType.LINK:
return await self._handle_link(element, element_path) await self._handle_link(element, element_path)
elif element.type == IliasElementType.BOOKING: elif element.type == IliasElementType.BOOKING:
return await self._handle_booking(element, element_path) await self._handle_booking(element, element_path)
elif element.type == IliasElementType.VIDEO: elif element.type == IliasElementType.VIDEO:
return await self._handle_file(element, element_path) await self._handle_file(element, element_path)
elif element.type == IliasElementType.VIDEO_PLAYER: elif element.type == IliasElementType.VIDEO_PLAYER:
return await self._handle_video(element, element_path) await self._handle_video(element, element_path)
elif element.type in _DIRECTORY_PAGES: elif element.type in _DIRECTORY_PAGES:
return await self._handle_ilias_page(element.url, element, element_path) maybe_cl = await self.crawl(element_path)
if not maybe_cl:
return None
await self._crawl_ilias_page(element.url, element, maybe_cl)
else: else:
# This will retry it a few times, failing everytime. It doesn't make any network # This will retry it a few times, failing everytime. It doesn't make any network
# requests, so that's fine. # requests, so that's fine.
@ -385,7 +340,7 @@ instance's greatest bottleneck.
self, self,
element: IliasPageElement, element: IliasPageElement,
element_path: PurePath, element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]: ) -> None:
log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}") log.explain_topic(f"Decision: Crawl Link {fmt_path(element_path)}")
log.explain(f"Links type is {self._links}") log.explain(f"Links type is {self._links}")
@ -402,7 +357,7 @@ instance's greatest bottleneck.
if not maybe_dl: if not maybe_dl:
return None return None
return self._download_link(element, link_template_maybe, maybe_dl) await self._download_link(element, link_template_maybe, maybe_dl)
@anoncritical @anoncritical
@_iorepeat(3, "resolving link") @_iorepeat(3, "resolving link")
@ -494,7 +449,7 @@ instance's greatest bottleneck.
self, self,
element: IliasPageElement, element: IliasPageElement,
element_path: PurePath, element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]: ) -> None:
# Copy old mapping as it is likely still relevant # Copy old mapping as it is likely still relevant
if self.prev_report: if self.prev_report:
self.report.add_custom_value( self.report.add_custom_value(
@ -520,7 +475,7 @@ instance's greatest bottleneck.
return None return None
return self._download_video(element_path, element, maybe_dl) await self._download_video(element_path, element, maybe_dl)
def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]: def _previous_contained_videos(self, video_path: PurePath) -> List[PurePath]:
if not self.prev_report: if not self.prev_report:
@ -602,11 +557,11 @@ instance's greatest bottleneck.
self, self,
element: IliasPageElement, element: IliasPageElement,
element_path: PurePath, element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]: ) -> None:
maybe_dl = await self.download(element_path, mtime=element.mtime) maybe_dl = await self.download(element_path, mtime=element.mtime)
if not maybe_dl: if not maybe_dl:
return None return None
return self._download_file(element, maybe_dl) await self._download_file(element, maybe_dl)
@anoncritical @anoncritical
@_iorepeat(3, "downloading file") @_iorepeat(3, "downloading file")
@ -649,16 +604,16 @@ instance's greatest bottleneck.
self, self,
element: IliasPageElement, element: IliasPageElement,
element_path: PurePath, element_path: PurePath,
) -> Optional[Coroutine[Any, Any, None]]: ) -> None:
maybe_cl = await self.crawl(element_path) maybe_cl = await self.crawl(element_path)
if not maybe_cl: if not maybe_cl:
return None return None
return self._crawl_forum(element, maybe_cl) await self._crawl_forum(element, maybe_cl)
@_iorepeat(3, "crawling forum") @_iorepeat(3, "crawling forum")
@anoncritical @anoncritical
async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None: async def _crawl_forum(self, element: IliasPageElement, cl: CrawlToken) -> None:
elements = [] elements: List[IliasForumThread] = []
async with cl: async with cl:
next_stage_url = element.url next_stage_url = element.url
@ -677,6 +632,10 @@ instance's greatest bottleneck.
download_data = page.get_download_forum_data() download_data = page.get_download_forum_data()
if not download_data: if not download_data:
raise CrawlWarning("Failed to extract forum data") raise CrawlWarning("Failed to extract forum data")
if download_data.empty:
log.explain("Forum had no threads")
elements = []
return
html = await self._post_authenticated(download_data.url, download_data.form_data) html = await self._post_authenticated(download_data.url, download_data.form_data)
elements = parse_ilias_forum_export(soupify(html)) elements = parse_ilias_forum_export(soupify(html))
@ -804,14 +763,17 @@ class KitShibbolethLogin:
# Equivalent: Click on "Mit KIT-Account anmelden" button in # Equivalent: Click on "Mit KIT-Account anmelden" button in
# https://ilias.studium.kit.edu/login.php # https://ilias.studium.kit.edu/login.php
url = "https://ilias.studium.kit.edu/shib_login.php" url = f"{_ILIAS_URL}/shib_login.php"
data = { data = {
"sendLogin": "1", "sendLogin": "1",
"idp_selection": "https://idp.scc.kit.edu/idp/shibboleth", "idp_selection": "https://idp.scc.kit.edu/idp/shibboleth",
"il_target": "", "il_target": "",
"home_organization_selection": "Weiter", "home_organization_selection": "Weiter",
} }
soup: BeautifulSoup = await _shib_post(sess, url, data) soup: Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful] = await _shib_post(sess, url, data)
if isinstance(soup, KitShibbolethBackgroundLoginSuccessful):
return
# Attempt to login using credentials, if necessary # Attempt to login using credentials, if necessary
while not self._login_successful(soup): while not self._login_successful(soup):
@ -850,7 +812,7 @@ class KitShibbolethLogin:
# (or clicking "Continue" if you have JS disabled) # (or clicking "Continue" if you have JS disabled)
relay_state = soup.find("input", {"name": "RelayState"}) relay_state = soup.find("input", {"name": "RelayState"})
saml_response = soup.find("input", {"name": "SAMLResponse"}) saml_response = soup.find("input", {"name": "SAMLResponse"})
url = "https://ilias.studium.kit.edu/Shibboleth.sso/SAML2/POST" url = f"{_ILIAS_URL}/Shibboleth.sso/SAML2/POST"
data = { # using the info obtained in the while loop above data = { # using the info obtained in the while loop above
"RelayState": relay_state["value"], "RelayState": relay_state["value"],
"SAMLResponse": saml_response["value"], "SAMLResponse": saml_response["value"],
@ -899,22 +861,35 @@ async def _post(session: aiohttp.ClientSession, url: str, data: Any) -> Beautifu
return soupify(await response.read()) return soupify(await response.read())
async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> BeautifulSoup: async def _shib_post(
session: aiohttp.ClientSession,
url: str,
data: Any
) -> Union[BeautifulSoup, KitShibbolethBackgroundLoginSuccessful]:
""" """
aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected aiohttp unescapes '/' and ':' in URL query parameters which is not RFC compliant and rejected
by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and by Shibboleth. Thanks a lot. So now we unroll the requests manually, parse location headers and
build encoded URL objects ourselves... Who thought mangling location header was a good idea?? build encoded URL objects ourselves... Who thought mangling location header was a good idea??
""" """
log.explain_topic("Shib login POST")
async with session.post(url, data=data, allow_redirects=False) as response: async with session.post(url, data=data, allow_redirects=False) as response:
location = response.headers.get("location") location = response.headers.get("location")
log.explain(f"Got location {location!r}")
if not location: if not location:
raise CrawlWarning(f"Login failed (1), no location header present at {url}") raise CrawlWarning(f"Login failed (1), no location header present at {url}")
correct_url = yarl.URL(location, encoded=True) correct_url = yarl.URL(location, encoded=True)
log.explain(f"Corrected location to {correct_url!r}")
if str(correct_url).startswith(_ILIAS_URL):
log.explain("ILIAS recognized our shib token and logged us in in the background, returning")
return KitShibbolethBackgroundLoginSuccessful()
async with session.get(correct_url, allow_redirects=False) as response: async with session.get(correct_url, allow_redirects=False) as response:
location = response.headers.get("location") location = response.headers.get("location")
log.explain(f"Redirected to {location!r} with status {response.status}")
# If shib still still has a valid session, it will directly respond to the request # If shib still still has a valid session, it will directly respond to the request
if location is None: if location is None:
log.explain("Shib recognized us, returning its response directly")
return soupify(await response.read()) return soupify(await response.read())
as_yarl = yarl.URL(response.url) as_yarl = yarl.URL(response.url)
@ -928,6 +903,7 @@ async def _shib_post(session: aiohttp.ClientSession, url: str, data: Any) -> Bea
path=location, path=location,
encoded=True encoded=True
) )
log.explain(f"Corrected location to {correct_url!r}")
async with session.get(correct_url, allow_redirects=False) as response: async with session.get(correct_url, allow_redirects=False) as response:
return soupify(await response.read()) return soupify(await response.read())
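One small change worth calling out: `_visited_urls` is now a dict from URL to the path under which the element was first seen, which is what lets the improved warning show both paths. The following is a stripped-down sketch of that bookkeeping, with a stand-in exception class and made-up values.

```python
from pathlib import PurePath
from typing import Dict

class CrawlWarning(Exception):
    """Stand-in for PFERD's CrawlWarning."""

visited_urls: Dict[str, PurePath] = {}

def mark_visited(url: str, parent_path: PurePath, name: str) -> None:
    """Remember where an element was first found; warn if a second path leads to it."""
    if url in visited_urls:
        raise CrawlWarning(
            f"Found second path to element {name!r} at {url!r}. "
            f"First path: {str(visited_urls[url])!r}. "
            f"Second path: {str(parent_path)!r}."
        )
    visited_urls[url] = parent_path

mark_visited("https://ilias.example.edu/goto.php?target=file_1", PurePath("Course/Week 1"), "slides.pdf")
try:
    mark_visited("https://ilias.example.edu/goto.php?target=file_1", PurePath("Course/Archive"), "slides.pdf")
except CrawlWarning as warning:
    print(warning)
```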

View File

@ -2,7 +2,7 @@ import os
import re import re
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import PurePath from pathlib import PurePath
from typing import Awaitable, List, Optional, Pattern, Set, Union from typing import List, Optional, Pattern, Set, Tuple, Union
from urllib.parse import urljoin from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@ -64,67 +64,62 @@ class KitIpdCrawler(HttpCrawler):
self._file_regex = section.link_regex() self._file_regex = section.link_regex()
async def _run(self) -> None: async def _run(self) -> None:
maybe_cl = await self.crawl(PurePath(".")) cl = await self.crawl(PurePath("."))
if not maybe_cl: if not cl:
return return
tasks: List[Awaitable[None]] = [] async with cl:
async with maybe_cl:
for item in await self._fetch_items(): for item in await self._fetch_items():
if isinstance(item, KitIpdFolder): if isinstance(item, KitIpdFolder):
tasks.append(self._crawl_folder(item)) await self._crawl_folder(item)
else: else:
# Orphan files are placed in the root folder # Orphan files are placed in the root folder
tasks.append(self._download_file(PurePath("."), item)) await self._download_file(PurePath("."), item)
await self.gather(tasks)
async def _crawl_folder(self, folder: KitIpdFolder) -> None: async def _crawl_folder(self, folder: KitIpdFolder) -> None:
path = PurePath(folder.name) path = PurePath(folder.name)
if not await self.crawl(path): if not await self.crawl(path):
return return
tasks = [self._download_file(path, file) for file in folder.files] for file in folder.files:
await self._download_file(path, file)
await self.gather(tasks)
async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
element_path = parent / file.name element_path = parent / file.name
maybe_dl = await self.download(element_path) dl = await self.download(element_path)
if not maybe_dl: if not dl:
return return
async with maybe_dl as (bar, sink): async with dl as (bar, sink):
await self._stream_from_url(file.url, sink, bar) await self._stream_from_url(file.url, sink, bar)
async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
page = await self.get_page() page, url = await self._get_page()
elements: List[Tag] = self._find_file_links(page) elements: List[Tag] = self._find_file_links(page)
items: Set[Union[KitIpdFile, KitIpdFolder]] = set() items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
for element in elements: for element in elements:
folder_label = self._find_folder_label(element) folder_label = self._find_folder_label(element)
if folder_label: if folder_label:
folder = self._extract_folder(folder_label) folder = self._extract_folder(folder_label, url)
if folder not in items: if folder not in items:
items.add(folder) items.add(folder)
folder.explain() folder.explain()
else: else:
file = self._extract_file(element) file = self._extract_file(element, url)
items.add(file) items.add(file)
log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})") log.explain_topic(f"Orphan file {file.name!r} (href={file.url!r})")
log.explain("Attributing it to root folder") log.explain("Attributing it to root folder")
return items return items
def _extract_folder(self, folder_tag: Tag) -> KitIpdFolder: def _extract_folder(self, folder_tag: Tag, url: str) -> KitIpdFolder:
files: List[KitIpdFile] = [] files: List[KitIpdFile] = []
name = folder_tag.getText().strip() name = folder_tag.getText().strip()
container: Tag = folder_tag.findNextSibling(name="table") container: Tag = folder_tag.findNextSibling(name="table")
for link in self._find_file_links(container): for link in self._find_file_links(container):
files.append(self._extract_file(link)) files.append(self._extract_file(link, url))
return KitIpdFolder(name, files) return KitIpdFolder(name, files)
@ -135,16 +130,16 @@ class KitIpdCrawler(HttpCrawler):
return None return None
return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$")) return enclosing_table.findPreviousSibling(name=re.compile("^h[1-6]$"))
def _extract_file(self, link: Tag) -> KitIpdFile: def _extract_file(self, link: Tag, url: str) -> KitIpdFile:
url = self._abs_url_from_link(link) url = self._abs_url_from_link(url, link)
name = os.path.basename(url) name = os.path.basename(url)
return KitIpdFile(name, url) return KitIpdFile(name, url)
def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]: def _find_file_links(self, tag: Union[Tag, BeautifulSoup]) -> List[Tag]:
return tag.findAll(name="a", attrs={"href": self._file_regex}) return tag.findAll(name="a", attrs={"href": self._file_regex})
def _abs_url_from_link(self, link_tag: Tag) -> str: def _abs_url_from_link(self, url: str, link_tag: Tag) -> str:
return urljoin(self._url, link_tag.get("href")) return urljoin(url, link_tag.get("href"))
async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None:
async with self.session.get(url, allow_redirects=False) as resp: async with self.session.get(url, allow_redirects=False) as resp:
@ -159,12 +154,12 @@ class KitIpdCrawler(HttpCrawler):
sink.done() sink.done()
async def get_page(self) -> BeautifulSoup: async def _get_page(self) -> Tuple[BeautifulSoup, str]:
async with self.session.get(self._url) as request: response = self.session.get(self._url)
# The web page for Algorithmen für Routenplanung contains some
# weird comments that beautifulsoup doesn't parse correctly. This # The web page for Algorithmen für Routenplanung contains some
# hack enables those pages to be crawled, and should hopefully not # weird comments that beautifulsoup doesn't parse correctly. This
# cause issues on other pages. # hack enables those pages to be crawled, and should hopefully not
content = (await request.read()).decode("utf-8") # cause issues on other pages.
content = re.sub(r"<!--.*?-->", "", content) content = re.sub(r"<!--.*?-->", "", response.text)
return soupify(content.encode("utf-8")) return soupify(content.encode("utf-8")), str(response.url)
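The reason `_extract_file` now receives the page URL after redirects (and why an earlier commit appends a trailing slash to kit-ipd links) is how `urljoin` resolves relative hrefs: it resolves against the base URL's directory, so a pre-redirect base or one without a trailing slash silently drops the last path segment. A quick illustration with made-up URLs:

```python
from urllib.parse import urljoin

href = "intern/slides01.pdf"  # a relative link as it appears on the page

# Base without a trailing slash: the last segment is treated as a file and replaced
print(urljoin("https://ipd.example.edu/lehre/algo2", href))
# https://ipd.example.edu/lehre/intern/slides01.pdf

# Post-redirect base with a trailing slash: the link resolves inside the directory
print(urljoin("https://ipd.example.edu/lehre/algo2/", href))
# https://ipd.example.edu/lehre/algo2/intern/slides01.pdf
```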

View File

@ -71,8 +71,6 @@ class LocalCrawler(Crawler):
if not cl: if not cl:
return return
tasks = []
async with cl: async with cl:
await asyncio.sleep(random.uniform( await asyncio.sleep(random.uniform(
0.5 * self._crawl_delay, 0.5 * self._crawl_delay,
@ -81,9 +79,7 @@ class LocalCrawler(Crawler):
for child in path.iterdir(): for child in path.iterdir():
pure_child = cl.path / child.name pure_child = cl.path / child.name
tasks.append(self._crawl_path(child, pure_child)) await self._crawl_path(child, pure_child)
await self.gather(tasks)
async def _crawl_file(self, path: Path, pure: PurePath) -> None: async def _crawl_file(self, path: Path, pure: PurePath) -> None:
stat = path.stat() stat = path.stat()

View File

@ -1,97 +0,0 @@
import asyncio
import time
from contextlib import asynccontextmanager
from dataclasses import dataclass
from typing import AsyncIterator, Optional
@dataclass
class Slot:
active: bool = False
last_left: Optional[float] = None
class Limiter:
def __init__(
self,
task_limit: int,
download_limit: int,
task_delay: float
):
if task_limit <= 0:
raise ValueError("task limit must be at least 1")
if download_limit <= 0:
raise ValueError("download limit must be at least 1")
if download_limit > task_limit:
raise ValueError("download limit can't be greater than task limit")
if task_delay < 0:
raise ValueError("Task delay must not be negative")
self._slots = [Slot() for _ in range(task_limit)]
self._downloads = download_limit
self._delay = task_delay
self._condition = asyncio.Condition()
def _acquire_slot(self) -> Optional[Slot]:
for slot in self._slots:
if not slot.active:
slot.active = True
return slot
return None
async def _wait_for_slot_delay(self, slot: Slot) -> None:
if slot.last_left is not None:
delay = slot.last_left + self._delay - time.time()
if delay > 0:
await asyncio.sleep(delay)
def _release_slot(self, slot: Slot) -> None:
slot.last_left = time.time()
slot.active = False
@asynccontextmanager
async def limit_crawl(self) -> AsyncIterator[None]:
slot: Slot
async with self._condition:
while True:
if found_slot := self._acquire_slot():
slot = found_slot
break
await self._condition.wait()
await self._wait_for_slot_delay(slot)
try:
yield
finally:
async with self._condition:
self._release_slot(slot)
self._condition.notify_all()
@asynccontextmanager
async def limit_download(self) -> AsyncIterator[None]:
slot: Slot
async with self._condition:
while True:
if self._downloads <= 0:
await self._condition.wait()
continue
if found_slot := self._acquire_slot():
slot = found_slot
self._downloads -= 1
break
await self._condition.wait()
await self._wait_for_slot_delay(slot)
try:
yield
finally:
async with self._condition:
self._release_slot(slot)
self._downloads += 1
self._condition.notify_all()
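For context on what the de-async refactor drops: crawl and download tokens previously entered the Limiter's async context managers, which capped concurrent crawls and downloads and enforced a per-slot delay. A rough usage sketch, assuming the Limiter class above is in scope and using arbitrary limits:

```python
import asyncio

async def main() -> None:
    limiter = Limiter(task_limit=2, download_limit=1, task_delay=0.5)

    async def crawl(name: str) -> None:
        # At most two crawls run at once; re-using a slot waits out the task delay
        async with limiter.limit_crawl():
            print(f"crawling {name}")
            await asyncio.sleep(0.1)

    async def download(name: str) -> None:
        # Downloads additionally count against the separate download limit
        async with limiter.limit_download():
            print(f"downloading {name}")
            await asyncio.sleep(0.1)

    await asyncio.gather(crawl("a"), crawl("b"), download("c"))

asyncio.run(main())
```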

View File

@ -92,17 +92,32 @@ def url_set_query_params(url: str, params: Dict[str, str]) -> str:
def str_path(path: PurePath) -> str: def str_path(path: PurePath) -> str:
"""
Turn a path into a string, in a platform-independent way.
This function always uses "/" as path separator, even on Windows.
"""
if not path.parts: if not path.parts:
return "." return "."
return "/".join(path.parts) return "/".join(path.parts)
def fmt_path(path: PurePath) -> str: def fmt_path(path: PurePath) -> str:
"""
Turn a path into a delimited string.
This is useful if file or directory names contain weird characters like
newlines, leading/trailing whitespace or unprintable characters. This way,
they are escaped and visible to the user.
"""
return repr(str_path(path)) return repr(str_path(path))
def fmt_real_path(path: Path) -> str: def fmt_real_path(path: Path) -> str:
return repr(str(path.absolute())) """
Like fmt_path, but resolves the path before converting it to a string.
"""
return fmt_path(path.absolute())
class ReusableAsyncContextManager(ABC, Generic[T]): class ReusableAsyncContextManager(ABC, Generic[T]):
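A short illustration of the documented behaviour of these helpers; the import assumes they live in `PFERD.utils`, which is inferred from this repository's layout.

```python
from pathlib import Path, PurePath

from PFERD.utils import fmt_path, fmt_real_path, str_path  # assumed module path

print(str_path(PurePath()))                    # "." for an empty path
print(str_path(PurePath("foo") / "bar baz"))   # "foo/bar baz", always "/"-separated
print(fmt_path(PurePath("weird\nname")))       # prints 'weird\nname' (newline escaped by repr)
print(fmt_real_path(Path("notes.txt")))        # the absolute path of notes.txt, quoted like fmt_path
```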

View File

@ -1,2 +1,2 @@
NAME = "PFERD" NAME = "PFERD"
VERSION = "3.4.1" VERSION = "3.4.3"

View File

@ -30,7 +30,10 @@ The use of [venv](https://docs.python.org/3/library/venv.html) is recommended.
Unofficial packages are available for: Unofficial packages are available for:
- [AUR](https://aur.archlinux.org/packages/pferd) - [AUR](https://aur.archlinux.org/packages/pferd)
- [brew](https://formulae.brew.sh/formula/pferd)
- [conda-forge](https://github.com/conda-forge/pferd-feedstock)
- [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix) - [nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/tools/misc/pferd/default.nix)
- [PyPi](https://pypi.org/project/pferd)
See also PFERD's [repology page](https://repology.org/project/pferd/versions). See also PFERD's [repology page](https://repology.org/project/pferd/versions).

View File

@ -14,4 +14,4 @@ pip install --editable .
# Installing tools and type hints # Installing tools and type hints
pip install --upgrade mypy flake8 autopep8 isort pyinstaller pip install --upgrade mypy flake8 autopep8 isort pyinstaller
pip install --upgrade types-chardet types-certifi mypy PFERD --install-types --non-interactive

View File

@ -11,6 +11,7 @@ install_requires =
rich>=11.0.0 rich>=11.0.0
keyring>=23.5.0 keyring>=23.5.0
certifi>=2021.10.8 certifi>=2021.10.8
requests>=2.28.1
[options.entry_points] [options.entry_points]
console_scripts = console_scripts =