Scraping Data from Eastmoney (东方财富网) | Web Crawler

import requests
import time
import json
import multiprocessing


class MyProcess(multiprocessing.Process):
    def __init__(self, url):
        super(MyProcess, self).__init__()
        self.url = url

    def return_result(self):
        pass

    def run(self):
        self.return_result()
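# Note: the MyProcess subclass above is not used further down; the script creates
# its workers with the function-based multiprocessing.Process(target=...) form instead.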


start_time = time.time()
# Request headers
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Referer': 'http://quote.eastmoney.com/centerv2/xsb',
}
# Dynamic IP proxy. You can build your own proxy pool; here a test proxy IP copied
# from a proxy vendor's site is used. Note that the dict keys must be the scheme
# names ('http'/'https'), not 'https://'.
proxies = {
    'http': 'http://117.90.4.11:41629',
    'https': 'https://117.90.4.11:41629',
}
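# A minimal sketch of rotating through your own proxy pool instead of a single
# test address (the pool entries below are placeholders, not working proxies):
#
#     import random
#     PROXY_POOL = ['http://10.0.0.1:8080', 'http://10.0.0.2:3128']  # hypothetical addresses
#     proxies = {'http': random.choice(PROXY_POOL),
#                'https': random.choice(PROXY_POOL)}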

def get_page(url, q):
    response = requests.get(url, headers=headers, proxies=proxies)
    # 3. Pass the page text back through the queue (inter-process communication)
    q.put(response.text)
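# A more defensive variant of get_page is sketched below, assuming a failed page
# may simply be skipped (the timeout keeps a dead proxy from hanging the worker):
#
#     def get_page(url, q):
#         try:
#             response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
#             q.put(response.text)
#         except requests.RequestException:
#             pass  # put nothing; the main loop's q.empty() check then skips this page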
    
i = 1
data_list = []
q = multiprocessing.Queue()
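# Note: with the "spawn" start method (the default on Windows), process-creating
# code like the loop below must sit under an `if __name__ == '__main__':` guard;
# on platforms that fork, it runs as written.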
while True:
    print('===========================%s'%i)
    # For testing only: limit the number of pages fetched
    if i == 20:
        break
    # 1. Build the URL for this page (page / pageSize control paging; sortType/sortRule sort by change percent)
    dest_url = """http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?type=CT&cmd=C._81.BASIC&sty=FCOIA&sortType=(ChangePercent)&sortRule=-1&page={0}&pageSize=20&token=7bc05d0d4c3c22ef9fca8c2a912d779c&jsName=quote_123&_g=0.628606915911589&_=1526803275980""".format(i)
    # 2. Issue the request in a child process
    p = multiprocessing.Process(target=get_page, args=(dest_url, q))
    p.start()
    # Wait for the worker to finish so its result is actually on the queue
    p.join()
    if not q.empty():
        res_content = q.get()
        # 3. Parse the response string: strip the wrapper characters, then split into per-stock records
        field_list = res_content[3:-3].split('","')
        for data in field_list:
            data = data.split(',')
            code = data[1]
            name = data[2]
            new_price = data[3]
            change_price = data[4]
            change_rate = data[5]
            done_price = data[6]
            yestd_input = data[7]
            today_input = data[8]
            highest = data[9]
            lowest = data[10]
            data_dict = {
                "code": code,
                "name": name,
                "new_price": new_price,
                "change_price": change_price,
                "change_rate": change_rate,
                "done_price": done_price,
                "yestd_input": yestd_input,
                "today_input": today_input,
                "highest": highest,
                "lowest": lowest,
            }
            # 4. Append the record dict to the global list
            data_list.append(data_dict)
    # 5. The while loop walks through every page number
    i += 1
# 6. Persist the collected data as JSON
with open("file.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data_list, ensure_ascii=False))
end_time = time.time()
print("当前抓起所用时间: ", end_time-start_time)

Reposted from www.cnblogs.com/pymkl/p/9089761.html