Python asynchronous coroutine crawler error [aiohttp.client_exceptions.ServerDisconnectedError: Server disconnected]: initial solution

Initial code:

import asyncio
import datetime
import os

import aiohttp

async def myRequest(data):
    url = data['url']
    header = data['header']
    country = data['country']
    category = data['category']
    # Each task opens its own session here -- this is the root cause of the errors below
    async with aiohttp.ClientSession(headers=header) as session:
        async with session.get(url) as resp:
            html = await resp.read()
            today = str(datetime.date.today())
            filePath = '%s.xlsx' % category
            absPath = os.path.join(dir, today, country, filePath)  # dir is defined elsewhere in the original code
            print(absPath, 3333)
            with open(absPath, 'wb') as f:
                f.write(html)

def main():
    tasks = []
    for data in requestDataList:  # requestDataList holds the per-request dicts (url, header, country, category)
        c = myRequest(data)
        task = asyncio.ensure_future(c)
        tasks.append(task)
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))

if __name__ == '__main__':
    main()
A simplified version of the same pattern:

async def download_page(url):
    async with aiohttp.ClientSession() as session:  # again, a new session for every task
        async with session.get(url) as resp:
            result = await resp.text()

async def main(urls):
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download_page(url)))  # my Python version is 3.9.6
    await asyncio.wait(tasks)

if __name__ == '__main__':
    urls = [url1, url2, ...]  # the list of target URLs
    asyncio.run(main(urls))

This is the most basic asynchronous coroutine framework. When the amount of data is small it basically meets the requirements, but once the data volume grows a little, errors start to appear. The error messages I have collected are as follows (see the sketch after this list for one way to surface them):

  • aiohttp.client_exceptions.ClientOSError: [WinError 64] The specified network name is no longer available.
    Task exception was never retrieved

  • aiohttp.client_exceptions.ClientOSError: [WinError 121] The semaphore timeout has expired
    Task exception was never retrieved

  • aiohttp.client_exceptions.ServerDisconnectedError: Server disconnected
    Task exception was never retrieved
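Each of these errors ends with "Task exception was never retrieved", which means the failing tasks were never asked for their results. This is not part of the original post, but a minimal sketch of how to make such failures visible is to collect results with asyncio.gather(..., return_exceptions=True) instead of asyncio.wait(); the fetch_all helper and the example URLs below are hypothetical.

import asyncio
import aiohttp

async def download_page(url, session):
    async with session.get(url) as resp:
        return await resp.text()

async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_page(u, session)) for u in urls]
        # return_exceptions=True keeps one failed request from hiding the others;
        # each exception is returned as a result instead of being silently dropped
        results = await asyncio.gather(*tasks, return_exceptions=True)
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f'{url} failed: {result!r}')

# asyncio.run(fetch_all(['https://example.com/a', 'https://example.com/b']))  # hypothetical URLs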

Solution:

The root cause of these errors is that each task creates its own ClientSession; when too many sessions are opened at once, the connections start to fail.

The fix is to create only one session and share it across all tasks:

import asyncio
import aiohttp

async def download_page(url, session):
    async with session.get(url) as resp:
        result = await resp.content.read()
        print(result)

async def main(urls):
    tasks = []
    async with aiohttp.ClientSession() as session:  # create the session once in main() and pass it into download_page as an argument
        for url in urls:
            tasks.append(asyncio.create_task(download_page(url, session)))
            # my Python version is 3.9.6; on Python 3.8+ tasks should be created with asyncio.create_task(),
            # otherwise the code still runs but emits a warning
        await asyncio.wait(tasks)

if __name__ == '__main__':
    urls = [url1, url2, ...]  # the list of target URLs
    asyncio.run(main(urls))
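If the crawler still hits ServerDisconnectedError or timeouts with a very large URL list, a common extra measure (beyond what this post covers) is to cap how many requests run at the same time. A minimal sketch, assuming plain GET requests; the limit value of 20 and the bounded_download helper are arbitrary names chosen for illustration:

import asyncio
import aiohttp

async def download_page(url, session):
    async with session.get(url) as resp:
        return await resp.content.read()

async def bounded_download(url, session, sem):
    # the semaphore lets at most `limit` coroutines issue requests at once
    async with sem:
        return await download_page(url, session)

async def main(urls, limit=20):
    sem = asyncio.Semaphore(limit)
    # TCPConnector(limit=...) also caps simultaneous connections at the session level
    connector = aiohttp.TCPConnector(limit=limit)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [asyncio.create_task(bounded_download(u, session, sem)) for u in urls]
        await asyncio.gather(*tasks, return_exceptions=True)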

Origin blog.csdn.net/zhuan_long/article/details/130340436