diff --git a/PFERD/crawl/crawler.py b/PFERD/crawl/crawler.py
index 0e67c02..dd500e6 100644
--- a/PFERD/crawl/crawler.py
+++ b/PFERD/crawl/crawler.py
@@ -293,6 +293,8 @@ class Crawler(ABC):
     async def download(
         self,
         path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
         mtime: Optional[datetime] = None,
         redownload: Optional[Redownload] = None,
         on_conflict: Optional[OnConflict] = None,
@@ -307,7 +309,14 @@
             log.status("[bold bright_black]", "Ignored", fmt_path(path))
             return None

-        fs_token = await self._output_dir.download(path, transformed_path, mtime, redownload, on_conflict)
+        fs_token = await self._output_dir.download(
+            path,
+            transformed_path,
+            etag_differs=etag_differs,
+            mtime=mtime,
+            redownload=redownload,
+            on_conflict=on_conflict
+        )
         if fs_token is None:
             log.explain("Answer: No")
             return None
diff --git a/PFERD/crawl/http_crawler.py b/PFERD/crawl/http_crawler.py
index 44ec4dd..39b22f3 100644
--- a/PFERD/crawl/http_crawler.py
+++ b/PFERD/crawl/http_crawler.py
@@ -1,8 +1,9 @@
 import asyncio
 import http.cookies
 import ssl
+from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple

 import aiohttp
 import certifi
@@ -15,6 +16,8 @@ from ..utils import fmt_real_path
 from ..version import NAME, VERSION
 from .crawler import Crawler, CrawlerSection

+ETAGS_CUSTOM_REPORT_VALUE_KEY = "etags"
+

 class HttpCrawlerSection(CrawlerSection):
     def http_timeout(self) -> float:
@@ -169,6 +172,54 @@
             log.warn(f"Failed to save cookies to {fmt_real_path(self._cookie_jar_path)}")
             log.warn(str(e))

+    def _get_previous_etag_from_report(self, path: PurePath) -> Optional[str]:
+        """
+        If available, retrieves the entity tag for a given path which was stored in the previous report.
+        """
+        if not self._output_dir.prev_report:
+            return None
+
+        etags = self._output_dir.prev_report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
+        return etags.get(str(path))
+
+    def _add_etag_to_report(self, path: PurePath, etag: Optional[str]) -> None:
+        """
+        Adds an entity tag for a given path to the report's custom values.
+        """
+        if not etag:
+            return
+
+        etags = self._output_dir.report.get_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY) or {}
+        etags[str(path)] = etag
+        self._output_dir.report.add_custom_value(ETAGS_CUSTOM_REPORT_VALUE_KEY, etags)
+
+    async def _request_resource_version(self, resource_url: str) -> Tuple[Optional[str], Optional[datetime]]:
+        """
+        Requests the ETag and Last-Modified headers of a resource via a HEAD request.
+        If no entity tag / modification date can be obtained, the corresponding value will be None.
+ """ + try: + async with self.session.head(resource_url) as resp: + if resp.status != 200: + return None, None + + etag_header = resp.headers.get("ETag") + last_modified_header = resp.headers.get("Last-Modified") + + if last_modified_header: + try: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#directives + datetime_format = "%a, %d %b %Y %H:%M:%S GMT" + last_modified = datetime.strptime(last_modified_header, datetime_format) + except ValueError: + # last_modified remains None + pass + + return etag_header, last_modified + except aiohttp.ClientError: + return None, None + async def run(self) -> None: self._request_count = 0 self._cookie_jar = aiohttp.CookieJar() diff --git a/PFERD/crawl/kit_ipd_crawler.py b/PFERD/crawl/kit_ipd_crawler.py index c852be0..d9515e2 100644 --- a/PFERD/crawl/kit_ipd_crawler.py +++ b/PFERD/crawl/kit_ipd_crawler.py @@ -1,6 +1,7 @@ import os import re from dataclasses import dataclass +from datetime import datetime from pathlib import PurePath from typing import Awaitable, List, Optional, Pattern, Set, Tuple, Union from urllib.parse import urljoin @@ -75,8 +76,11 @@ class KitIpdCrawler(HttpCrawler): if isinstance(item, KitIpdFolder): tasks.append(self._crawl_folder(item)) else: + # do this here to at least be sequential and not parallel (rate limiting is hard, as the + # crawl abstraction does not hold for these requests) + etag, mtime = await self._request_resource_version(item.url) # Orphan files are placed in the root folder - tasks.append(self._download_file(PurePath("."), item)) + tasks.append(self._download_file(PurePath("."), item, etag, mtime)) await self.gather(tasks) @@ -85,18 +89,36 @@ class KitIpdCrawler(HttpCrawler): if not await self.crawl(path): return - tasks = [self._download_file(path, file) for file in folder.files] + tasks = [] + for file in folder.files: + # do this here to at least be sequential and not parallel (rate limiting is hard, as the crawl + # abstraction does not hold for these requests) + etag, mtime = await self._request_resource_version(file.url) + tasks.append(self._download_file(path, file, etag, mtime)) await self.gather(tasks) - async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None: + async def _download_file( + self, + parent: PurePath, + file: KitIpdFile, + etag: Optional[str], + mtime: Optional[datetime] + ) -> None: element_path = parent / file.name - maybe_dl = await self.download(element_path) + + prev_etag = self._get_previous_etag_from_report(element_path) + etag_differs = None if prev_etag is None else prev_etag != etag + + maybe_dl = await self.download(element_path, etag_differs=etag_differs, mtime=mtime) if not maybe_dl: + # keep storing the known file's etag + if prev_etag: + self._add_etag_to_report(element_path, prev_etag) return async with maybe_dl as (bar, sink): - await self._stream_from_url(file.url, sink, bar) + await self._stream_from_url(file.url, element_path, sink, bar) async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]: page, url = await self.get_page() @@ -146,7 +168,7 @@ class KitIpdCrawler(HttpCrawler): def _abs_url_from_link(self, url: str, link_tag: Tag) -> str: return urljoin(url, link_tag.get("href")) - async def _stream_from_url(self, url: str, sink: FileSink, bar: ProgressBar) -> None: + async def _stream_from_url(self, url: str, path: PurePath, sink: FileSink, bar: ProgressBar) -> None: async with self.session.get(url, allow_redirects=False) as resp: if resp.status == 403: raise CrawlError("Received a 403. 
@@ -159,6 +181,8 @@

         sink.done()

+        self._add_etag_to_report(path, resp.headers.get("ETag"))
+
     async def get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
diff --git a/PFERD/output_dir.py b/PFERD/output_dir.py
index e9e9b93..09cf133 100644
--- a/PFERD/output_dir.py
+++ b/PFERD/output_dir.py
@@ -57,6 +57,7 @@ class OnConflict(Enum):

 @dataclass
 class Heuristics:
+    etag_differs: Optional[bool]
     mtime: Optional[datetime]


@@ -233,8 +234,16 @@

         remote_newer = None

+        # ETag should be a more reliable indicator than mtime, so we check it first
+        if heuristics.etag_differs is not None:
+            remote_newer = heuristics.etag_differs
+            if remote_newer:
+                log.explain("Remote file's entity tag differs")
+            else:
+                log.explain("Remote file's entity tag is the same")
+
         # Python on Windows crashes when faced with timestamps around the unix epoch
-        if heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
+        if remote_newer is None and heuristics.mtime and (os.name != "nt" or heuristics.mtime.year > 1970):
             mtime = heuristics.mtime
             remote_newer = mtime.timestamp() > stat.st_mtime
             if remote_newer:
@@ -366,6 +375,8 @@
         self,
         remote_path: PurePath,
         path: PurePath,
+        *,
+        etag_differs: Optional[bool] = None,
         mtime: Optional[datetime] = None,
         redownload: Optional[Redownload] = None,
         on_conflict: Optional[OnConflict] = None,
@@ -375,7 +386,7 @@
         MarkConflictError.
         """

-        heuristics = Heuristics(mtime)
+        heuristics = Heuristics(etag_differs, mtime)
         redownload = self._redownload if redownload is None else redownload
         on_conflict = self._on_conflict if on_conflict is None else on_conflict
         local_path = self.resolve(path)
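
Note on the ordering in output_dir.py (illustrative, not part of the patch): once
etag_differs is known, it settles the freshness question and the mtime heuristic
is skipped entirely. The standalone sketch below mirrors that decision logic; the
function remote_is_newer and its parameters are hypothetical stand-ins for the
OutputDirectory internals.

    from datetime import datetime
    from typing import Optional

    def remote_is_newer(
        etag_differs: Optional[bool],
        remote_mtime: Optional[datetime],
        local_mtime: float,
        on_windows: bool = False,
    ) -> Optional[bool]:
        # ETag first: comparing entity tags is more reliable than comparing
        # timestamps, so a definite answer here wins.
        if etag_differs is not None:
            return etag_differs

        # Fall back to mtime, skipping epoch-adjacent timestamps on Windows
        # (Python crashes on those, per the comment in output_dir.py).
        if remote_mtime and (not on_windows or remote_mtime.year > 1970):
            return remote_mtime.timestamp() > local_mtime

        # Neither heuristic applies; callers fall back to the redownload policy.
        return None

    # An unchanged ETag wins even against a newer-looking remote mtime:
    assert remote_is_newer(False, datetime(2030, 1, 1), 0.0) is False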
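
And a quick standalone check of the fixed Last-Modified format used in
_request_resource_version; the header value is MDN's Last-Modified example,
chosen here for illustration:

    from datetime import datetime

    datetime_format = "%a, %d %b %Y %H:%M:%S GMT"
    parsed = datetime.strptime("Wed, 21 Oct 2015 07:28:00 GMT", datetime_format)
    assert parsed == datetime(2015, 10, 21, 7, 28)

Only the IMF-fixdate form parses; the obsolete RFC 850 and asctime forms raise
ValueError and are treated as "no modification date", matching the except branch
above. One caveat: %a and %b are locale-dependent in strptime, so parsing can
also fail under a non-English locale; email.utils.parsedate_to_datetime would
avoid that dependence.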