mirror of https://github.com/Garmelon/PFERD.git (synced 2023-12-21 10:23:01 +01:00)
Replace asyncio.gather with custom Crawler function
parent c0cecf8363
commit 29d5a40c57
@@ -1,7 +1,8 @@
+import asyncio
 from abc import ABC, abstractmethod
 from datetime import datetime
 from pathlib import Path, PurePath
-from typing import Any, Awaitable, Callable, Dict, Optional, Tuple, TypeVar
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar

 from rich.markup import escape

@@ -228,6 +229,25 @@ class Crawler(ABC):
             section.on_conflict(),
         )

+    @staticmethod
+    async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
+        """
+        Similar to asyncio.gather. However, in the case of an exception, all
+        still running tasks are cancelled and the exception is rethrown.
+
+        This should always be preferred over asyncio.gather in crawler code so
+        that an exception like CrawlError may actually stop the crawler.
+        """
+
+        tasks = [asyncio.ensure_future(aw) for aw in awaitables]
+        result = asyncio.gather(*tasks)
+        try:
+            return await result
+        except:  # noqa: E722
+            for task in tasks:
+                task.cancel()
+            raise
+
     async def crawl(self, path: PurePath) -> Optional[CrawlToken]:
         log.explain_topic(f"Decision: Crawl {fmt_path(path)}")

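The cancel-on-failure behaviour above is the point of the commit: plain asyncio.gather leaves sibling tasks running when one of them raises (unless return_exceptions=True is passed). A minimal standalone sketch of the same pattern, with a RuntimeError standing in for CrawlError and made-up coroutine names:

import asyncio
from typing import Any, Awaitable, List, Sequence


async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
    # Wrap each awaitable in a task so every sibling can be cancelled later.
    tasks = [asyncio.ensure_future(aw) for aw in awaitables]
    try:
        return await asyncio.gather(*tasks)
    except BaseException:  # the commit uses a bare "except:"; equivalent here
        for task in tasks:
            task.cancel()  # stop still-running siblings before rethrowing
        raise


async def slow_crawl() -> str:
    await asyncio.sleep(10)  # plain asyncio.gather would let this keep running
    return "done"


async def failing_crawl() -> str:
    await asyncio.sleep(0.1)
    raise RuntimeError("crawl failed")  # stands in for a CrawlError


async def main() -> None:
    try:
        await gather([slow_crawl(), failing_crawl()])
    except RuntimeError as e:
        print(f"stopped early: {e}")  # slow_crawl was cancelled, no 10 s wait


asyncio.run(main())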
@@ -1,4 +1,3 @@
-import asyncio
 import re
 from pathlib import PurePath
 from typing import Any, Awaitable, Callable, Dict, Optional, Set, TypeVar, Union
@@ -215,7 +214,7 @@ class KitIliasWebCrawler(HttpCrawler):
         # this method without having spawned a single task. Due to this we do
         # not need to cancel anything or worry about this gather call or the forks
         # further up.
-        await asyncio.gather(*tasks)
+        await self.gather(tasks)

         await impl()

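Note that the surrounding comment argues cancellation can never actually be needed at this call site; the switch to self.gather is made anyway, so that every fan-out point in crawler code behaves uniformly and follows the new docstring's advice that self.gather should always be preferred over asyncio.gather.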
@@ -240,7 +239,7 @@ class KitIliasWebCrawler(HttpCrawler):
         # this method without having spawned a single task. Due to this we do
         # not need to cancel anything or worry about this gather call or the forks
         # further up.
-        await asyncio.gather(*tasks)
+        await self.gather(tasks)

         await impl()

@@ -83,7 +83,7 @@ class LocalCrawler(Crawler):
             pure_child = pure / child.name
             tasks.append(self._crawl_path(child, pure_child))

-        await asyncio.gather(*tasks)
+        await self.gather(tasks)

     async def _crawl_file(self, path: Path, pure: PurePath) -> None:
         stat = path.stat()
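All three call sites follow the same shape: collect coroutines into a list, then await them as a group through Crawler.gather. A hypothetical, self-contained sketch of that pattern (MiniCrawler and its methods are illustrative names, not the project's real classes; gather is copied in so the sketch runs on its own):

import asyncio
from pathlib import Path, PurePath
from typing import Any, Awaitable, List, Sequence


class MiniCrawler:
    # Same idea as Crawler.gather above, reproduced to keep this standalone.
    @staticmethod
    async def gather(awaitables: Sequence[Awaitable[Any]]) -> List[Any]:
        tasks = [asyncio.ensure_future(aw) for aw in awaitables]
        try:
            return await asyncio.gather(*tasks)
        except BaseException:
            for task in tasks:
                task.cancel()
            raise

    async def _crawl_file(self, path: Path, pure: PurePath) -> None:
        print(f"file: {pure}")

    async def _crawl_path(self, path: Path, pure: PurePath) -> None:
        if path.is_file():
            await self._crawl_file(path, pure)
        elif path.is_dir():
            tasks = [self._crawl_path(c, pure / c.name) for c in path.iterdir()]
            # An exception in any child now cancels its siblings and
            # propagates, instead of leaving them running in the background.
            await self.gather(tasks)


asyncio.run(MiniCrawler()._crawl_path(Path("."), PurePath(".")))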