用lxml的xpath演示爬虫提取双色球开奖数据的期号、开奖日期、红球、蓝球、投注额及各等奖奖金信息 另外 附上正则爬取法

 人狠话不多,直接上源码

from  urllib import request,parse
from  urllib import error
import chardet
from lxml import etree
import csv


def shuangseqiu(url):
    """Fetch the double-color-ball (shuangseqiu) draw-history page at `url`,
    parse every result row via XPath, print each draw, and append one CSV
    row per draw to ./data1/shuangsheqiu.csv.

    NOTE(review): assumes the ./data1 directory already exists — confirm,
    or create it before running.

    :param url: URL of the draw-history page (e.g. zst.aicai.com/ssq/openInfo/)
    :raises: nothing to the caller; URLError is caught and printed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    req = request.Request(url, headers=headers)
    try:
        response = request.urlopen(req)
        resHtml = response.read().decode("utf-8", 'ignore')
        html = etree.HTML(resHtml)
        # Result rows are exactly the <tr> elements that carry the site's
        # hover-highlight handler.
        results = html.xpath('//tr[contains(@onmouseout,"this.style.background")]')

        filename = './data1/shuangsheqiu' + '.csv'
        # Open the output file ONCE (the original reopened it per row) and
        # pass newline='' as the csv module requires, so no blank lines are
        # inserted between records on Windows.
        with open(filename, 'a', encoding='utf-8', newline='') as file:
            wr = csv.writer(file)
            for site in results:
                cells = site.xpath('./td')
                # Skip header/summary rows that don't hold a full 15-cell
                # draw record (the original would raise IndexError here).
                if len(cells) < 15:
                    continue
                qihaonum = cells[0].text                        # draw number (期号)
                jackpotdate = cells[1].text                     # draw date (开奖日期)
                redball = ''.join(c.text for c in cells[2:8])   # six red balls (红球)
                blueball = cells[8].text                        # blue ball (蓝球)
                totalprice = cells[9].text                      # total stake, yuan (总投注额)
                OneNumber = cells[10].text                      # 1st-prize winner count (一等奖注数)
                OnePrice = cells[11].text                       # 1st-prize amount, yuan (一等奖奖金)
                TwoNumber = cells[12].text                      # 2nd-prize winner count (二等奖注数)
                TwoPrice = cells[13].text                       # 2nd-prize amount, yuan (二等奖奖金)
                jackpot = cells[14].text                        # jackpot rollover, yuan (奖池滚存)
                print("期号:%s,开奖日期:%s,红球:%s,蓝球:%s,总投注额(元):%s,一等奖(注数):%s,一等奖(奖金(元)):%s,二等奖(注数):%s,二等奖奖金(元):%s,奖池滚存(元):%s" % (
                    qihaonum, jackpotdate, redball, blueball, totalprice,
                    OneNumber, OnePrice, TwoNumber, TwoPrice, jackpot))
                # Write real CSV columns (the original wrote one giant
                # pre-formatted string into a single cell, making the file
                # impossible to load back field-by-field).
                wr.writerow([qihaonum, jackpotdate, redball, blueball, totalprice,
                             OneNumber, OnePrice, TwoNumber, TwoPrice, jackpot])
    except error.URLError as e:
        print(e)

if __name__ == "__main__":
    # Route HTTP requests through a proxy before fetching the page.
    # NOTE(review): the proxy address is hard-coded and may no longer be
    # reachable — confirm before relying on it.
    handler = request.ProxyHandler({"http": "118.31.220.3:8080"})
    request.install_opener(request.build_opener(handler))
    shuangseqiu("http://zst.aicai.com/ssq/openInfo/")

正则方法:

"""
level2:
3、双色球历史数据爬虫,爬取期数,开奖时间,红色球,蓝色球,一等奖,二等奖等信息
"""
from  urllib import request,parse
from  urllib import error
import chardet
from lxml import etree
import csv
import re


def shuangseqiu(url):
    """Regex-based variant: fetch the draw-history page, capture the inner
    text of every <td> cell with one pattern, and print the fields of the
    first (most recent) draw row.

    NOTE(review): relies on the first 15 <td> cells on the page belonging
    to one result row — verify against the live page layout.

    :param url: URL of the draw-history page.
    :raises: nothing to the caller; URLError is caught and printed.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    req = request.Request(url, headers=headers)
    try:
        response = request.urlopen(req)
        resHtml = response.read().decode("utf-8", 'ignore')
        # One pattern captures the text of every <td ...>...</td> on the page.
        td_pattern = re.compile(r'<td .*?>(.*?)</td>', re.I | re.S | re.M)
        cells = td_pattern.findall(resHtml)
        # Guard against layout changes (the original would IndexError).
        if len(cells) < 15:
            print("page layout changed: fewer than 15 <td> cells found")
            return
        qihaonum = cells[0]             # draw number (期号)
        date = cells[1]                 # draw date (开奖日期)
        red_qiu = ''.join(cells[2:8])   # six red balls (红球)
        blueqiu = cells[8]              # blue ball (蓝球)
        totalprice = cells[9]           # total stake, yuan (总投注额)
        OneNumber = cells[10]           # 1st-prize winner count (一等奖注数)
        OnePrice = cells[11]            # 1st-prize amount, yuan (一等奖奖金)
        TwoNumber = cells[12]           # 2nd-prize winner count (二等奖注数)
        TwoPrice = cells[13]            # 2nd-prize amount, yuan (二等奖奖金)
        jackpot = cells[14]             # jackpot rollover, yuan (奖池滚存)
        # The original computed all of these fields and silently discarded
        # them; report them the same way the XPath version does.
        print("期号:%s,开奖日期:%s,红球:%s,蓝球:%s,总投注额(元):%s,一等奖(注数):%s,一等奖(奖金(元)):%s,二等奖(注数):%s,二等奖奖金(元):%s,奖池滚存(元):%s" % (
            qihaonum, date, red_qiu, blueqiu, totalprice,
            OneNumber, OnePrice, TwoNumber, TwoPrice, jackpot))
    except error.URLError as e:
        print(e)

if __name__ == "__main__":
    # Install a proxy-aware opener globally, then run the scraper.
    # NOTE(review): hard-coded proxy — may be dead; confirm before use.
    proxy_handler = request.ProxyHandler({"http": "118.31.220.3:8080"})
    request.install_opener(request.build_opener(proxy_handler))
    target = "http://zst.aicai.com/ssq/openInfo/"
    shuangseqiu(target)

猜你喜欢

转载自blog.csdn.net/lzz781699880/article/details/81111852