Make crawlers use transformers

This commit is contained in:
Joscha 2021-05-15 14:03:15 +02:00
parent 302b8c0c34
commit b0f731bf84
3 changed files with 11 additions and 3 deletions

View File

@@ -273,6 +273,9 @@ class Crawler(ABC):
with self._conductor.progress_bar(desc, total=total) as bar: with self._conductor.progress_bar(desc, total=total) as bar:
yield bar yield bar
def should_crawl(self, path: PurePath) -> bool:
return self._transformer.transform(path) is not None
async def download( async def download(
self, self,
path: PurePath, path: PurePath,
@@ -280,8 +283,12 @@ class Crawler(ABC):
redownload: Optional[Redownload] = None, redownload: Optional[Redownload] = None,
on_conflict: Optional[OnConflict] = None, on_conflict: Optional[OnConflict] = None,
) -> Optional[AsyncContextManager[FileSink]]: ) -> Optional[AsyncContextManager[FileSink]]:
transformed_path = self._transformer.transform(path)
if transformed_path is None:
return None
return await self._output_dir.download( return await self._output_dir.download(
path, mtime, redownload, on_conflict) transformed_path, mtime, redownload, on_conflict)
async def cleanup(self) -> None: async def cleanup(self) -> None:
await self._output_dir.cleanup() await self._output_dir.cleanup()

View File

@@ -81,7 +81,8 @@ class LocalCrawler(Crawler):
for child in path.iterdir(): for child in path.iterdir():
pure_child = pure / child.name pure_child = pure / child.name
tasks.append(self._crawl_path(child, pure_child)) if self.should_crawl(child):
tasks.append(self._crawl_path(child, pure_child))
await asyncio.gather(*tasks) await asyncio.gather(*tasks)

View File

@@ -292,4 +292,4 @@ class Transformer:
else: else:
continue continue
return None return path