diff --git a/PFERD/crawler.py b/PFERD/crawler.py
index 9ec5991..f8cf091 100644
--- a/PFERD/crawler.py
+++ b/PFERD/crawler.py
@@ -273,6 +273,10 @@ class Crawler(ABC):
         with self._conductor.progress_bar(desc, total=total) as bar:
             yield bar
 
+    def should_crawl(self, path: PurePath) -> bool:
+        """Return True unless the transformer maps ``path`` to None."""
+        return self._transformer.transform(path) is not None
+
     async def download(
         self,
         path: PurePath,
@@ -280,8 +284,13 @@ class Crawler(ABC):
         redownload: Optional[Redownload] = None,
         on_conflict: Optional[OnConflict] = None,
     ) -> Optional[AsyncContextManager[FileSink]]:
+        # No transformed path means there is nothing to download.
+        transformed_path = self._transformer.transform(path)
+        if transformed_path is None:
+            return None
+
         return await self._output_dir.download(
-            path, mtime, redownload, on_conflict)
+            transformed_path, mtime, redownload, on_conflict)
 
     async def cleanup(self) -> None:
         await self._output_dir.cleanup()
diff --git a/PFERD/crawlers/local.py b/PFERD/crawlers/local.py
index 99bc700..360a9a9 100644
--- a/PFERD/crawlers/local.py
+++ b/PFERD/crawlers/local.py
@@ -81,7 +81,9 @@ class LocalCrawler(Crawler):
 
         for child in path.iterdir():
             pure_child = pure / child.name
-            tasks.append(self._crawl_path(child, pure_child))
+            # Filter on the pure path: should_crawl is declared to take a PurePath.
+            if self.should_crawl(pure_child):
+                tasks.append(self._crawl_path(child, pure_child))
 
         await asyncio.gather(*tasks)
 
diff --git a/PFERD/transformer.py b/PFERD/transformer.py
index 84332df..fb47c60 100644
--- a/PFERD/transformer.py
+++ b/PFERD/transformer.py
@@ -292,4 +292,4 @@ class Transformer:
             else:
                 continue
 
-        return None
+        return path