mirror of
https://github.com/Garmelon/PFERD.git
synced 2023-12-21 10:23:01 +01:00
Make crawlers use transformers
This commit is contained in:
parent
302b8c0c34
commit
b0f731bf84
@ -273,6 +273,9 @@ class Crawler(ABC):
|
|||||||
with self._conductor.progress_bar(desc, total=total) as bar:
|
with self._conductor.progress_bar(desc, total=total) as bar:
|
||||||
yield bar
|
yield bar
|
||||||
|
|
||||||
|
def should_crawl(self, path: PurePath) -> bool:
|
||||||
|
return self._transformer.transform(path) is not None
|
||||||
|
|
||||||
async def download(
|
async def download(
|
||||||
self,
|
self,
|
||||||
path: PurePath,
|
path: PurePath,
|
||||||
@ -280,8 +283,12 @@ class Crawler(ABC):
|
|||||||
redownload: Optional[Redownload] = None,
|
redownload: Optional[Redownload] = None,
|
||||||
on_conflict: Optional[OnConflict] = None,
|
on_conflict: Optional[OnConflict] = None,
|
||||||
) -> Optional[AsyncContextManager[FileSink]]:
|
) -> Optional[AsyncContextManager[FileSink]]:
|
||||||
|
transformed_path = self._transformer.transform(path)
|
||||||
|
if transformed_path is None:
|
||||||
|
return None
|
||||||
|
|
||||||
return await self._output_dir.download(
|
return await self._output_dir.download(
|
||||||
path, mtime, redownload, on_conflict)
|
transformed_path, mtime, redownload, on_conflict)
|
||||||
|
|
||||||
async def cleanup(self) -> None:
|
async def cleanup(self) -> None:
|
||||||
await self._output_dir.cleanup()
|
await self._output_dir.cleanup()
|
||||||
|
@ -81,7 +81,8 @@ class LocalCrawler(Crawler):
|
|||||||
|
|
||||||
for child in path.iterdir():
|
for child in path.iterdir():
|
||||||
pure_child = pure / child.name
|
pure_child = pure / child.name
|
||||||
tasks.append(self._crawl_path(child, pure_child))
|
if self.should_crawl(child):
|
||||||
|
tasks.append(self._crawl_path(child, pure_child))
|
||||||
|
|
||||||
await asyncio.gather(*tasks)
|
await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
@ -292,4 +292,4 @@ class Transformer:
|
|||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return None
|
return path
|
||||||
|
Loading…
Reference in New Issue
Block a user