Async crawler template -- aiohttp

import pymysql
import requests
import asyncio
import aiohttp
from fake_useragent import UserAgent
from pandas import read_excel, DataFrame

# Concurrency limit
CONCURRENCY = 10
semaphore = asyncio.Semaphore(CONCURRENCY)
session = None  # shared aiohttp session, created in main()

base_url = 'https://www.baidu.com'  # placeholder target; replace with the real API endpoint (aiohttp requires the scheme)


# Fetch a proxy from the local proxy pool service
def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").json()


# Remove a dead proxy from the pool
def delete_proxy(proxy):
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))


async def getDetail(name):
    async with semaphore:
        con = pymysql.connect(
            host='localhost',
            user='root',
            passwd='xxxx',
            db='xxxx',
            charset='utf8'
        )
        c = con.cursor()
        dic = {'': '', }  # placeholder: map each output field to its parsed value
        ua = UserAgent().random
        headers = {
            'user-agent': ua,
        }

        json_data = {
            # placeholder: request payload, typically built from `name`
        }
        # Retry up to 3 times per proxy, then swap in a fresh one
        retry_count = 3
        proxy = get_proxy().get("proxy")
        while True:
            try:
                async with session.post(base_url, proxy=f'http://{proxy}',
                                        headers=headers, json=json_data,
                                        timeout=aiohttp.ClientTimeout(total=5)) as response:
                    res = await response.json()
                    # Parse the JSON response
                    try:
                        dic[''] = res['']  # placeholder: fill in the real keys
                    except KeyError:
                        pass
                    print(dic)
                    data = DataFrame([dic])
                    tableName = ''  # placeholder: target table name
                    DBUtils(con, c, data, tableName).insert_data()
                    break
            except Exception as e:
                print(e)
                retry_count -= 1
                if retry_count == 0:
                    # The proxy looks dead: evict it and fetch a new one
                    delete_proxy(proxy)
                    retry_count = 3
                    proxy = get_proxy().get("proxy")
        c.close()
        con.close()


async def main():
    global session
    session = aiohttp.ClientSession()
    # placeholder: read the list of crawl targets from an Excel file
    List = read_excel(r'file_path.xlsx')['column_name'].values
    await asyncio.gather(*[getDetail(name) for name in List])
    await session.close()


class DBUtils:
    """Database helper that upserts a DataFrame into a MySQL table.

    :param db:     database connection, e.g. pymysql.connect(host='192.168.1.1', user='root', password='1234', port=3306, db='database_name')
    :param cursor: database cursor, e.g. cursor = db.cursor()
    :param data:   DataFrame holding the rows to write
    :param table:  name of the target table
    """

    def __init__(self, db, cursor, data, table):
        self.db = db
        self.cursor = cursor
        self.data = data
        self.table = table

    # Upsert: insert new rows, update existing ones keyed on the primary key
    def insert_data(self):
        # Create the target table on first use (the schema below is the hotel
        # example from the original template; adapt the columns to your data)
        self.cursor.execute(
            """CREATE TABLE IF NOT EXISTS `{}` (
                `酒店id` int(0) NOT NULL,
                `酒店名称` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `品牌` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `档次` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `省份` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `城市` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `商圈` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `地址` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `经度` decimal(10, 7) NULL DEFAULT NULL,
                `纬度` decimal(10, 7) NULL DEFAULT NULL,
                `房间数` int(0) NULL DEFAULT NULL,
                `开业时间` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `装修时间` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `简介` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL,
                `评分` varchar(5) NULL DEFAULT NULL,
                `房型数` int(0) NULL DEFAULT NULL,
                `电话` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
                `房型详情` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL,
                PRIMARY KEY (`酒店id`) USING BTREE
            ) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;""".format(self.table)
        )
        keys = ', '.join('`' + self.data.keys() + '`')
        values = ', '.join(['%s'] * len(self.data.columns))
        # Build the upsert: insert each row, or update it on a primary-key collision
        sql = 'INSERT INTO `{table}` ({keys}) VALUES ({values}) ON DUPLICATE KEY UPDATE '.format(
            table=self.table, keys=keys, values=values)
        update = ', '.join("`{key}` = %s".format(key=key) for key in self.data)
        sql += update

        for i in range(len(self.data)):
            try:
                # Each row's values are bound twice: once for INSERT, once for UPDATE
                self.cursor.execute(sql, tuple(self.data.loc[i]) * 2)
                # print('Writing row %d' % (i + 1))
                self.db.commit()
            except Exception as e:
                print("Insert failed: %s" % e)
                self.db.rollback()


if __name__ == '__main__':
    asyncio.run(main())
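
The template fires off one coroutine per target via asyncio.gather, and the semaphore is what keeps only CONCURRENCY requests in flight at once. A minimal standalone sketch of that pattern (the sleep stands in for the real HTTP call):

import asyncio

async def worker(sem, i):
    # Only a limited number of workers may be inside this block at once
    async with sem:
        await asyncio.sleep(1)  # stand-in for the real request
        print('task', i, 'done')

async def demo():
    sem = asyncio.Semaphore(3)
    await asyncio.gather(*[worker(sem, i) for i in range(10)])

asyncio.run(demo())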

Purpose: the template turns data scraped from an API into a DataFrame and writes it straight into MySQL, upserting on the primary key so reruns don't create duplicate rows.
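
To see what DBUtils.insert_data actually sends to MySQL, here is a minimal sketch of the statement it builds, run standalone against a hypothetical two-column table:

from pandas import DataFrame

data = DataFrame([{'id': 1, 'name': 'demo'}])
table = 'demo_table'  # hypothetical table name

keys = ', '.join('`' + data.keys() + '`')        # `id`, `name`
values = ', '.join(['%s'] * len(data.columns))   # %s, %s
update = ', '.join('`{}` = %s'.format(k) for k in data)
sql = 'INSERT INTO `{}` ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}'.format(
    table, keys, values, update)
print(sql)  # INSERT INTO `demo_table` (`id`, `name`) VALUES (%s, %s) ON DUPLICATE KEY UPDATE `id` = %s, `name` = %s
# Each row's values are then bound twice: cursor.execute(sql, tuple(data.loc[0]) * 2)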

The template is meant to run behind an IP proxy pool; if the target has no meaningful anti-scraping measures, you can drop the proxy handling and request directly.
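
The /get/ and /delete/ calls above assume a proxy-pool HTTP service listening on 127.0.0.1:5010; adjust the URLs if yours differs. A small sanity check you can run before starting the crawler (httpbin.org here is just an illustrative test target):

import requests

def check_pool():
    proxy = requests.get('http://127.0.0.1:5010/get/').json().get('proxy')
    if not proxy:
        print('Proxy pool is empty; wait for it to fill or crawl without proxies.')
        return
    try:
        # Confirm the proxy actually forwards traffic before trusting it
        r = requests.get('https://httpbin.org/ip', timeout=5,
                         proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'})
        print(proxy, '->', r.status_code)
    except requests.RequestException:
        # Dead proxy: evict it from the pool
        requests.get(f'http://127.0.0.1:5010/delete/?proxy={proxy}')

check_pool()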

Reposted from blog.csdn.net/Achernar0208/article/details/126390837