人狠话不多,直接上源码
from urllib import request,parse
from urllib import error
import chardet
from lxml import etree
import csv
def shuangseqiu(url):
    """Fetch the shuangseqiu (double-color-ball) result table at *url*,
    print every draw, and append the parsed fields to ./data1/shuangsheqiu.csv.

    Result rows are located by their onmouseout-highlight attribute; the <td>
    cells are assumed to be ordered: draw number, draw date, 6 red balls,
    blue ball, total stake, 1st-prize count/amount, 2nd-prize count/amount,
    jackpot rollover — TODO confirm against the live page layout.

    Network failures are caught and printed rather than raised.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    req = request.Request(url, headers=headers)
    try:
        response = request.urlopen(req)
        resHtml = response.read().decode("utf-8", "ignore")
        html = etree.HTML(resHtml)
        rows = html.xpath('//tr[contains(@onmouseout,"this.style.background")]')
        # Make sure the output directory exists; the original open() failed
        # outright when ./data1 was missing.
        import os
        os.makedirs('./data1', exist_ok=True)
        # Open the CSV once for the whole run (not once per row) and pass
        # newline='' as the csv module requires.
        with open('./data1/shuangsheqiu.csv', 'a', encoding='utf-8', newline='') as file:
            wr = csv.writer(file)
            for site in rows:
                cells = site.xpath('./td')
                if len(cells) < 15:
                    # Defensive: skip rows that don't carry the 15 expected cells.
                    continue
                qihaonum = cells[0].text                        # 期号 draw number
                jackpotdate = cells[1].text                     # 开奖日期 draw date
                redball = ''.join(c.text for c in cells[2:8])   # 红球 six red balls
                blueball = cells[8].text                        # 蓝球 blue ball
                totalprice = cells[9].text                      # 总投注额(元)
                OneNumber = cells[10].text                      # 一等奖(注数)
                OnePrice = cells[11].text                       # 一等奖奖金(元)
                TwoNumber = cells[12].text                      # 二等奖(注数)
                TwoPrice = cells[13].text                       # 二等奖奖金(元)
                jackpot = cells[14].text                        # 奖池滚存(元)
                print("期号:%s,开奖日期:%s,红球:%s,蓝球:%s,总投注额(元):%s,一等奖(注数):%s,一等奖(奖金(元)):%s,二等奖(注数):%s,二等奖奖金(元):%s,奖池滚存(元):%s" % (
                    qihaonum, jackpotdate, redball, blueball, totalprice,
                    OneNumber, OnePrice, TwoNumber, TwoPrice, jackpot))
                # Write one field per column; the original wrote a single
                # pre-formatted string, producing a one-column quoted CSV.
                wr.writerow([qihaonum, jackpotdate, redball, blueball, totalprice,
                             OneNumber, OnePrice, TwoNumber, TwoPrice, jackpot])
    except error.URLError as e:
        print(e)
if __name__ == "__main__":
    # Route all HTTP requests through the proxy before scraping.
    handler = request.ProxyHandler({"http": "118.31.220.3:8080"})
    request.install_opener(request.build_opener(handler))
    shuangseqiu("http://zst.aicai.com/ssq/openInfo/")
下面是改用正则表达式提取数据的另一种实现方法:
"""
level2:
3、双色球历史数据爬虫,爬取期数,开奖时间,红色球,蓝色球,一等奖,二等奖等信息
"""
from urllib import request,parse
from urllib import error
import chardet
from lxml import etree
import csv
import re
def shuangseqiu(url):
    """Regex-based variant: fetch the page, pull the text of every <td>
    cell, and print the fields of the first draw row.

    NOTE(review): the flat index mapping (cells[0..14]) only matches the
    first data row, and assumes no other table's <td> cells precede it on
    the page — confirm against the live markup.  Network failures are
    caught and printed rather than raised.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    req = request.Request(url, headers=headers)
    try:
        response = request.urlopen(req)
        resHtml = response.read().decode("utf-8", "ignore")
        # Capture the text of every <td> on the page, in document order.
        cell_pattern = re.compile(r'<td .*?>(.*?)</td>', re.I | re.S | re.M)
        cells = cell_pattern.findall(resHtml)
        qihaonum = cells[0]               # 期号 draw number
        date = cells[1]                   # 开奖日期 draw date
        red_qiu = ''.join(cells[2:8])     # 红球 six red balls
        blueqiu = cells[8]                # 蓝球 blue ball
        totalprice = cells[9]             # 总投注额(元)
        OneNumber = cells[10]             # 一等奖(注数)
        OnePrice = cells[11]              # 一等奖奖金(元)
        TwoNumber = cells[12]             # 二等奖(注数)
        TwoPrice = cells[13]              # 二等奖奖金(元)
        jackpot = cells[14]               # 奖池滚存(元)
        # The original computed these fields but never used them, so the
        # function had no observable effect; report them like the xpath version.
        print("期号:%s,开奖日期:%s,红球:%s,蓝球:%s,总投注额(元):%s,一等奖(注数):%s,一等奖(奖金(元)):%s,二等奖(注数):%s,二等奖奖金(元):%s,奖池滚存(元):%s" % (
            qihaonum, date, red_qiu, blueqiu, totalprice,
            OneNumber, OnePrice, TwoNumber, TwoPrice, jackpot))
    except error.URLError as e:
        print(e)
if __name__ == "__main__":
    # Install a global opener that tunnels HTTP traffic via the proxy,
    # then kick off the scrape.
    proxy_handler = request.ProxyHandler({"http": "118.31.220.3:8080"})
    request.install_opener(request.build_opener(proxy_handler))
    target = "http://zst.aicai.com/ssq/openInfo/"
    shuangseqiu(target)