Mirror of https://github.com/Garmelon/PFERD.git
Make ipd crawler synchronous
commit 6f87c5c774
parent 1ca10571f0
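
This commit replaces the fan-out pattern, where awaitables are first collected in a list and then run concurrently through the crawler's gather helper, with plain sequential awaits: each folder crawl and file download now finishes before the next one starts. A minimal, self-contained sketch of the before/after patterns, using the stdlib asyncio.gather as a stand-in for the crawler's own self.gather helper (all names here are hypothetical, not PFERD code):

import asyncio
from typing import Awaitable, List

async def download_file(name: str) -> None:
    # Stand-in for a real download; sleeps instead of doing network I/O.
    await asyncio.sleep(0.1)
    print(f"downloaded {name}")

async def run_concurrent(names: List[str]) -> None:
    # Before: collect awaitables, then run them all at once.
    tasks: List[Awaitable[None]] = [download_file(n) for n in names]
    await asyncio.gather(*tasks)

async def run_sequential(names: List[str]) -> None:
    # After: await each download before starting the next.
    for name in names:
        await download_file(name)

if __name__ == "__main__":
    asyncio.run(run_sequential(["a.pdf", "b.pdf"]))

The sequential variant starts each request only after the previous one completes, which is what makes the crawler "synchronous" in the sense of the commit title.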
@@ -64,42 +64,37 @@ class KitIpdCrawler(HttpCrawler):
         self._file_regex = section.link_regex()
 
     async def _run(self) -> None:
-        maybe_cl = await self.crawl(PurePath("."))
-        if not maybe_cl:
+        cl = await self.crawl(PurePath("."))
+        if not cl:
             return
 
-        tasks: List[Awaitable[None]] = []
-
-        async with maybe_cl:
+        async with cl:
             for item in await self._fetch_items():
                 if isinstance(item, KitIpdFolder):
-                    tasks.append(self._crawl_folder(item))
+                    await self._crawl_folder(item)
                 else:
                     # Orphan files are placed in the root folder
-                    tasks.append(self._download_file(PurePath("."), item))
-
-        await self.gather(tasks)
+                    await self._download_file(PurePath("."), item)
 
     async def _crawl_folder(self, folder: KitIpdFolder) -> None:
         path = PurePath(folder.name)
         if not await self.crawl(path):
             return
 
-        tasks = [self._download_file(path, file) for file in folder.files]
-
-        await self.gather(tasks)
+        for file in folder.files:
+            await self._download_file(path, file)
 
     async def _download_file(self, parent: PurePath, file: KitIpdFile) -> None:
         element_path = parent / file.name
-        maybe_dl = await self.download(element_path)
-        if not maybe_dl:
+        dl = await self.download(element_path)
+        if not dl:
             return
 
-        async with maybe_dl as (bar, sink):
+        async with dl as (bar, sink):
             await self._stream_from_url(file.url, sink, bar)
 
     async def _fetch_items(self) -> Set[Union[KitIpdFile, KitIpdFolder]]:
-        page, url = await self.get_page()
+        page, url = await self._get_page()
         elements: List[Tag] = self._find_file_links(page)
         items: Set[Union[KitIpdFile, KitIpdFolder]] = set()
 
@@ -159,7 +154,7 @@ class KitIpdCrawler(HttpCrawler):
 
         sink.done()
 
-    async def get_page(self) -> Tuple[BeautifulSoup, str]:
+    async def _get_page(self) -> Tuple[BeautifulSoup, str]:
         async with self.session.get(self._url) as request:
             # The web page for Algorithmen für Routenplanung contains some
             # weird comments that beautifulsoup doesn't parse correctly. This
|