aio crawler: URL de-duplication and database insertion

# aio crawler: crawl pages, de-duplicate URLs, insert results into MySQL
import asyncio
import aiohttp
import aiomysql
import re
from pyquery import PyQuery

stopping = False
start_url = 'http://www.jobbole.com/'

waiting_urls = []
seen_urls = set()
# URL de-duplication -- a bloom filter could replace this set (a sketch follows the main script)

# Fetch a page and return its HTML when the response looks OK.
async def fetch(url, session):
    try:
        async with session.get(url) as resp:
            print(resp.status)
            if resp.status in [200, 201]:
                data = await resp.text()
                return data
    except Exception as e:
        print(e)


# extract_urls does no I/O, so a plain (non-async) function is fine
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waiting_urls.append(url)
    return urls


async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)

async def article_handler(url, session, pool):
    # fetch an article detail page, parse it, and insert the title into MySQL
    html = await fetch(url, session)
    seen_urls.add(url)
    extract_urls(html)
    pq = PyQuery(html)
    title = pq('title').text()
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute('SELECT 42;')  # trivial test query, kept from the original
            # use a parameterized query so quotes in the title cannot break the SQL
            insert_sql = 'insert into aiomysql_test(title) values(%s)'
            await cur.execute(insert_sql, (title,))
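            # Note: the target table is not shown in the original post; it is
            # assumed to look roughly like
            #   CREATE TABLE aiomysql_test (
            #       id INT AUTO_INCREMENT PRIMARY KEY,
            #       title VARCHAR(255)
            #   ) DEFAULT CHARSET=utf8;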


async def consumer(pool, session):
    while not stopping:
        if len(waiting_urls) == 0:
            await asyncio.sleep(0.5)
            continue
        url = waiting_urls.pop()
        print('start fetching url: {}'.format(url))
        if re.match(r'http://.*?jobbole\.com/\d+/', url):
            if url not in seen_urls:
                asyncio.ensure_future(article_handler(url, session, pool))
        # else:
        #     if url not in seen_urls:
        #         asyncio.ensure_future(init_urls(url,session))


async def main(loop):
    # wait until the MySQL connection pool is ready
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='root',
                                      db='aiomysql_test', loop=loop,
                                      charset='utf8', autocommit=True)
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        extract_urls(html)
        # keep the session open for as long as the consumer keeps crawling
        await consumer(pool, session)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
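
The comment next to seen_urls mentions swapping the in-memory set for a bloom filter. A minimal, self-contained sketch of that idea follows; it is not part of the original script, the size and hash_count values are arbitrary assumptions, and a bloom filter trades a small false-positive rate (some URLs may be wrongly treated as already seen) for a fixed memory footprint.

import hashlib

class BloomFilter:
    def __init__(self, size=2 ** 20, hash_count=5):
        self.size = size
        self.hash_count = hash_count
        self.bits = bytearray(size // 8 + 1)

    def _positions(self, item):
        # derive hash_count bit positions from salted md5 digests of the item
        for i in range(self.hash_count):
            digest = hashlib.md5('{}:{}'.format(i, item).encode('utf8')).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, item):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item):
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))

Usage mirrors the set: check "url in bloom" before queueing a URL and call bloom.add(url) once it has been fetched.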


Reposted from www.cnblogs.com/Erick-L/p/8939607.html