# Meituan crawler, advanced edition (美团爬虫进阶)

import csv
import time
import threading
from get_cookie import get_cookie
from get_cookie import parse


# One lock shared by every worker thread.  Creating the lock inside crow()
# would hand each thread its own private lock, which serialises nothing.
_CSV_WRITE_LOCK = threading.Lock()


def crow(n, l):
    """Worker loop: drain the shared list ``l`` and append parsed rows to the CSV.

    Each item is removed from the front of ``l`` and handed to ``parse``.
    ``parse`` is expected to return an indexable result whose element 0 is a
    status mark and element 1 is the row to write:

    * mark ``0`` -- success; the row is appended to ``G:\\2.csv``.
    * mark ``2`` -- wait 1.5 s and retry once with the same IP/cookie.
    * mark ``1`` -- fetch a fresh IP/cookie via ``get_cookie`` and retry once.

    After more than two consecutive failed retries the IP/cookie is replaced
    again and the failure counter is reset.

    Args:
        n: Number identifying this worker thread; used in the exit message
            and forwarded to ``parse``.
        l: List of work items shared by all worker threads.  It is mutated:
            items are popped from the front until it is empty.

    Returns:
        None.  The function returns once ``l`` is empty.
    """
    sym = 0  # consecutive-failure counter
    pc = get_cookie()  # IP and cookie used for requests
    m = 0  # number of items this thread has taken so far
    now = time.time()
    while True:
        # Pop first and handle the empty case, instead of checking len() and
        # then popping: another thread may take the last item in between.
        try:
            u = l.pop(0)
        except IndexError:
            print('&&&&线程：%d结束' % n)
            break
        ll = len(l)  # remaining items, forwarded to parse
        m += 1
        ttt = time.time() - now  # seconds elapsed since this worker started
        result = parse(u, pc, m, n, ll, ttt)
        mark = result[0]
        info = result[1]
        if mark == 2:
            time.sleep(1.5)
            result = parse(u, pc, m, n, ll, ttt)
            mark = result[0]
            info = result[1]
            if mark != 0:
                sym += 1
        # Deliberately not an elif: a retry above that comes back with mark 1
        # falls through to the cookie-refresh retry, as in the original flow.
        if mark == 1:
            pc = get_cookie()
            result = parse(u, pc, m, n, ll, ttt)
            mark = result[0]
            info = result[1]
            if mark != 0:
                sym += 1
        if mark == 0:  # fetched successfully
            sym = 0
            # The context managers release the lock and close the file even
            # if writing the row raises.
            with _CSV_WRITE_LOCK:
                with open('G:\\2.csv', 'a', newline='', encoding='gb18030') as f:
                    csv.writer(f).writerow(info)
        if sym > 2:  # three failures in a row: switch IP and cookie
            sym = 0
            pc = get_cookie()


if __name__ == '__main__':
    # Build the shared work list: one [url, second CSV column] pair per row,
    # where the URL is assembled from the row's third and fourth columns.
    # NOTE(review): this is the same path crow() appends its results to --
    # confirm that input and output are really meant to share one file.
    with open('G:\\2.csv', 'r', encoding='gb18030') as src:
        url_list = [
            [
                ''.join(('https://meishi.meituan.com/i/poi/', str(row[2]),
                         '?ct_poi=', str(row[3]))),
                row[1],
            ]
            for row in csv.reader(src)
        ]

    # Start five workers numbered 1..5 that all drain the same list.
    th_list = []
    for idx in range(1, 6):
        worker = threading.Thread(target=crow, args=(idx, url_list))
        print('*****线程%d开始启动...' % idx)
        worker.start()
        th_list.append(worker)
        # Pause 30 s after every start (the last one included) so the
        # workers come up staggered rather than all at once.
        time.sleep(30)

    # Wait for every worker to finish before the script exits.
    for worker in th_list:
        worker.join()
# Source: https://blog.csdn.net/xing851483876/article/details/81842329?utm_source=blogxgwz3
# (Page footer from the original post: "猜你喜欢" / "You may also like")