Use the Scrapy framework to write a crawler for eastmoney

Open http://guba.eastmoney.com/ and click "More" in the popular stocks bar.

On this page you can see that pretty much every stock is listed. Press F12 to open the browser's developer tools and look at the page source.

First of all, check the menu, which splits the stocks into the Shanghai (沪市) and Shenzhen (深市) markets.

Then, for convenience, simply list the start URLs as follows:

"http://guba.eastmoney.com/remenba.aspx?type=1&tab=1",
"http://guba.eastmoney.com/remenba.aspx?type=1&tab=2",
"http://guba.eastmoney.com/remenba.aspx?type=1&tab=3",
"http://guba.eastmoney.com/remenba.aspx?type=1&tab=4",
"http://guba.eastmoney.com/remenba.aspx?type=1&tab=5",
"http://guba.eastmoney.com/remenba.aspx?type=1&tab=6"

Then look at the specific stock bars listed under each menu tab.

You can see that the page splits the listing into two parts, namely the <ul> elements with classes ngblistul2 and ngblistul2 hide; clicking "more" reveals the hidden part with the remaining stocks.

Click on one of the entries and inspect the web page structure.

 

That is, the href in the <a> tag is the information we want

Remember to combine the hrefs from both ngblistul2 and ngblistul2 hide, as in the snippet below:

lis = response.xpath('/html/body/div[1]/div[5]/div[2]/div[1]/div/ul[1]/li')
lis2 = response.xpath('/html/body/div[1]/div[5]/div[2]/div[1]/div/ul[2]/li')
lis.extend(lis2)
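
Each href is relative, so it has to be joined against the page URL. A minimal sketch of the loop (the full version appears in parse() below; the example href matches the page we open next):

for li in lis:
    href = li.css('a::attr(href)').extract_first()  # e.g. 'list,600000.html'
    detail_url = response.urljoin(href)             # -> 'http://guba.eastmoney.com/list,600000.html'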

After obtaining the URL of each stock bar, we open one of the pages, e.g. http://guba.eastmoney.com/list,600000.html, which shows exactly the information we want to crawl.

Open the page source. The information we want is easy to spot, but there is a gotcha here: only the stock bar name can be scraped directly from the HTML. Figures like price change and trading volume are not in the static page. After reloading the page with the network panel open, I found the request that fetches this data asynchronously.

Since I didn't know what the query parameters meant, I had to dig through the site's JavaScript source. It turns out that only the cmd parameter changes between stocks. So the question becomes how to set cmd correctly, because the stock code and the cmd value don't look quite the same.

So I searched the page source for 6000001 directly and found that the value we need lives in a <script> block: the QuoteCode variable inside it is exactly the cmd parameter, and it can be extracted as follows:

quoteCode = response.xpath('//*/script').re(r"var QuoteCode = \"(.*)\";")[0]
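
Outside of a running spider, the same extraction can be reproduced with parsel, the selector library Scrapy uses (the shape of the inline script here is an assumption inferred from the regex):

from parsel import Selector

# assumed shape of the page's inline script, for illustration only
html = '<html><body><script>var QuoteCode = "6000001";</script></body></html>'
sel = Selector(text=html)
quote_code = sel.xpath('//*/script').re(r'var QuoteCode = "(.*)";')[0]
print(quote_code)  # 6000001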

Once the code is extracted, we build the request and send it ourselves:

quoteURL = "http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?type=CT&cmd=" + quoteCode + "&sty=FDFGBTB&st=z&sr=&p=&ps=&lvl=&cb=?&js=&token=5c46f660fab8722944521b8807de07c0"
getres = self.send(quoteURL)
tt = str(getres.decode('utf8')[4:-3]).split(",")
print(tt)

def send(self, url):
    req = request.Request(url=url, headers=self.headers)
    res = request.urlopen(req)
    # urlopen returns raw bytes in Python 3; the caller decodes them with utf-8
    return res.read()
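
The [4:-3] slice strips the JSONP wrapper around the comma-separated payload. The post never shows the raw response, so the wrapper below is strictly a guess that is merely consistent with the slicing:

# hypothetical response shape: 4 leading chars ?([" and 3 trailing chars "])
raw = b'?(["6000001,-,600000,XXXX,11.20"])'
tt = raw.decode('utf8')[4:-3].split(",")
print(tt[0])  # '6000001' — the quote code; indexes tt[0..13] are mapped in parse2 below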

 Okay, that's basically it

Now, on to the full code.

First is a start.py script, used to launch the crawler on a schedule at specified times.

import datetime
import time

from scrapy import cmdline


def doSth(hour, minute):
    # 'eastmoney' is the spider's name; output goes to e.g. 0930.csv
    straa = 'scrapy crawl eastmoney -o ' + '%02d%02d.csv' % (hour, minute)
    cmdline.execute(straa.split())


# schedule the crawl for whatever times of day you want it to update
def time_ti():
    while True:
        now = datetime.datetime.now()
        if now.hour == 9 and now.minute == 30:
            doSth(9, 30)
        if now.hour == 14 and now.minute == 30:
            doSth(14, 30)
        if now.hour == 22 and now.minute == 0:
            doSth(22, 0)
        # check once every 60 seconds
        time.sleep(60)


if __name__ == '__main__':
    time_ti()
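
One caveat of my own, not from the original post: scrapy.cmdline.execute exits the process once the crawl finishes, so the loop above only ever fires once. A sketch that keeps the scheduler alive by running each crawl in a child process, assuming the scrapy command is on your PATH:

import subprocess

def doSth(hour, minute):
    # run the crawl in a separate process so the scheduling loop survives it
    subprocess.run(['scrapy', 'crawl', 'eastmoney',
                    '-o', '%02d%02d.csv' % (hour, minute)])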

Then items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EastmoneyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # stock name
    name = scrapy.Field()
    # stock code
    id = scrapy.Field()
    # number of watchers (关注量)
    man_number = scrapy.Field()
    # current popularity rank (当前热度排行)
    now_hot = scrapy.Field()
    # latest price (最新)
    latest = scrapy.Field()
    # price change (涨跌)
    zhangdie = scrapy.Field()
    # percentage change (涨幅)
    zhangfu = scrapy.Field()
    # trading volume (成交量)
    chengjiaoliang = scrapy.Field()
    # turnover (成交额)
    chengjiaoe = scrapy.Field()
    # circulating market cap (流通市值)
    shizhi = scrapy.Field()
    # turnover rate (换手)
    huanshou = scrapy.Field()
    # amplitude (振幅)
    zhenfu = scrapy.Field()
    # P/E ratio (市盈)
    shiying = scrapy.Field()

 

Finally, eastmoneyspider\spiders\eastmoney.py, which contains the actual crawling logic:

# -*- coding: utf-8 -*-
import re
import time

import scrapy
from urllib import parse,request
from eastmoneyspider.items import EastmoneyspiderItem
class EastmoneySpider(scrapy.Spider):
    name = 'eastmoney'
    allowed_domains = ['guba.eastmoney.com']
    # browser User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3704.400 QQBrowser/10.4.3587.400'
    }
    # cookies copied from a browser session (replace with your own)
    cookies = {
        'qgqp_b_id': 'c6fbd3a403db8993b4dd3eb2a320ccb6',
        'st_si': '73594088022242',
        'em_hq_fls': 'js',
        'st_asi': 'delete',
        'HAList': 'a-sh-600000-%u6D66%u53D1%u94F6%u884C%2Ca-sh-601388-%u6021%u7403%u8D44%u6E90%2Ca-sh-603019-%u4E2D%u79D1%u66D9%u5149',
        '_adsame_fullscreen_17590': '1',
        'st_pvi': '70133729415159',
        'st_sp': '2019-06-23%2013%3A22%3A52',
        'st_inirUrl': 'https%3A%2F%2Fwww.sogou.com%2Flink',
        'st_psi': '20190623160613932-117001300421-1256798903',
        'st_sn': '46'
    }
    urls = [
        'http://guba.eastmoney.com/remenba.aspx?type=1&tab=1',
        "http://guba.eastmoney.com/remenba.aspx?type=1&tab=2",
        "http://guba.eastmoney.com/remenba.aspx?type=1&tab=3",
        "http://guba.eastmoney.com/remenba.aspx?type=1&tab=4",
        "http://guba.eastmoney.com/remenba.aspx?type=1&tab=5",
        "http://guba.eastmoney.com/remenba.aspx?type=1&tab=6"
    ]

    # override the start_requests method
    def start_requests(self):
        for url in self.urls:
            time.sleep(0.2)
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse)

    def parse(self, response):
        # one stock category per page; list every stock bar under it,
        # from both the visible and the hidden <ul>
        lis = response.xpath('/html/body/div[1]/div[5]/div[2]/div[1]/div/ul[1]/li')
        lis2 = response.xpath('/html/body/div[1]/div[5]/div[2]/div[1]/div/ul[2]/li')
        lis.extend(lis2)
        for i in lis:
            url = i.css('a::attr(href)').extract_first()
            detail = response.urljoin(url)
            yield scrapy.Request(url=detail, headers=self.headers, cookies=self.cookies, callback=self.parse2)
    def parse2(self, response):
        item = EastmoneyspiderItem()
        item['name'] = response.xpath('//*[@id="stockname"]/a/text()').extract_first()
        # number of watchers
        item['man_number'] = response.xpath('//*[@id="stockheader"]/div[1]/span[1]/em/text()').extract_first()
        # current popularity rank
        item['now_hot'] = response.xpath('//*[@id="stockheader"]/div[1]/span[2]/em/text()').extract_first()

        # pull the QuoteCode out of the inline script, then query the quote API
        quoteCode = response.xpath('//*/script').re(r"var QuoteCode = \"(.*)\";")[0]
        quoteURL = "http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?type=CT&cmd=" + quoteCode + "&sty=FDFGBTB&st=z&sr=&p=&ps=&lvl=&cb=?&js=&token=5c46f660fab8722944521b8807de07c0"
        getres = self.send(quoteURL)
        # strip the JSONP wrapper and split the payload into fields
        tt = str(getres.decode('utf8')[4:-3]).split(",")
        print(tt)

        item["id"]=tt[0]
        item['name'] = tt[3]
        # 最新
        item['latest'] = tt[4]
        # 涨跌
        item['zhangdie'] = tt[5]
        # 涨幅
        item['zhangfu'] = tt[6]
        # 成交量
        item['chengjiaoliang'] = tt[8]
        # 成交额
        item['chengjiaoe'] = tt[9]
        # 流通市值
        item['shizhi'] = tt[13]
        # 换手
        item['huanshou'] = tt[10]
        # 振幅
        item['zhenfu'] = tt[12]
        # 市盈
        item['shiying'] = tt[11]
        yield item

    def send(self, url):
        # plain urllib request; this bypasses Scrapy, so allowed_domains
        # does not apply to the quote API host
        req = request.Request(url=url, headers=self.headers)
        res = request.urlopen(req)
        # urlopen returns raw bytes in Python 3; the caller decodes them
        return res.read()

That is all the code that needs to be written; everything else stays at the default configuration, which you can modify if necessary.

By the way, remember to replace the User-Agent header and the cookies with your own.
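
If you do touch settings.py, a few values are worth a look; these are my suggestions rather than anything from the original post:

# settings.py — optional tweaks (assumptions, not the post's configuration)
ROBOTSTXT_OBEY = False              # the default True may block some of these pages
DOWNLOAD_DELAY = 0.5                # throttle requests a little to be polite
FEED_EXPORT_ENCODING = 'utf-8-sig'  # CSVs with Chinese text open cleanly in Excel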


Origin: blog.csdn.net/qq_20176001/article/details/94148439