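# Copyright notice: this is an original article by the blog author and may not be reproduced without the author's permission. Source: https://blog.csdn.net/qq_39071593/article/details/84980570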
import re
import csv
import requests
from bs4 import BeautifulSoup
# Request headers sent with every page fetch. The User-Agent is a desktop
# Chrome 63 string so the server sees the scraper as an ordinary browser
# (the original author's note describes it as posing as the 360 browser).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}
def gethtml(url, timeout=10):
    """Download *url* and return the response body as decoded text.

    The request is sent with the module-level ``headers`` dict. Before the
    body is decoded, the response encoding is replaced with the encoding
    detected from the content (``apparent_encoding``).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error. Without a timeout a stalled server
            would block the script indefinitely.

    Returns:
        The page body as ``str``. The text (not the Response object) is
        returned because that is what BeautifulSoup parses.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = response.apparent_encoding
    return response.text
def space(string):
    """Separate a run of winning-number digits into two-character groups.

    A single space is inserted at every position that is preceded by a word
    character and followed, up to the end of the string, by a whole number
    of word-character pairs. Groups are therefore counted from the right:
    an odd-length input keeps one lone character at the front.

    Args:
        string: Concatenated numbers, e.g. ``"01020304050607"``.

    Returns:
        The same characters with spaces between the pairs, e.g.
        ``"01 02 03 04 05 06 07"``. Strings of two characters or fewer are
        returned unchanged.
    """
    return re.sub(r"(?<=\w)(?=(?:\w\w)+$)", " ", string)
def soup(html):
    """Parse one results page and write each draw to the CSV writer.

    Every element under ``<body>`` whose class is ``td-luckyno`` is read;
    its ``luckyno`` attribute is split into two-character numbers with
    ``space()`` and mapped, in order, onto the seven CSV columns. Each row
    is printed, and rows whose first number is non-empty are written
    through the module-level ``writer`` (a ``csv.DictWriter`` that must
    exist before this function is called).

    Args:
        html: HTML text of a results page.

    Returns:
        None. Output goes to stdout and to ``writer``.
    """
    fields = ('first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'special')
    page = BeautifulSoup(html, 'lxml')
    body = page.body
    if body is None:
        # No <body> in the document (e.g. an empty response): nothing to read.
        return
    for cell in body.find_all(attrs={'class': "td-luckyno"}):
        # A matching cell without the attribute is treated as having no numbers.
        numbers = space(cell.attrs.get('luckyno', '')).split()
        # Start from blank columns and fill as many as there are numbers.
        # zip() stops at the shorter sequence, so a cell with fewer than
        # seven numbers no longer raises IndexError, and any extra numbers
        # are ignored.
        row = dict.fromkeys(fields, '')
        row.update(zip(fields, numbers))
        print(row)
        # Skip rows that carried no numbers at all.
        if row['first'] != '':
            writer.writerow(row)
if __name__ == "__main__":
    # Number of history pages to fetch. This is only an experiment, so a
    # single page is fetched; the original author notes the site has 150+
    # pages if a fuller crawl is wanted.
    page_count = 1
    url_prefix = 'http://www.gdfc.org.cn/datas/history/367/history_'

    with open('36选7.csv', 'w', newline='') as csvfile:
        fieldnames = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'special']
        # ``writer`` is deliberately a module-level name: soup() looks it up
        # as a global when it writes rows.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # The fetch loop must stay inside the ``with`` block; otherwise the
        # file is already closed when soup() writes and Python raises
        # "I/O operation on closed file".
        for num in range(1, page_count + 1):
            soup(gethtml(url_prefix + str(num) + '.html'))

    # Reported after the file has been closed, i.e. once all rows are on disk.
    print('成功!!!')