Python 爬取“36选7”开奖号码并保存为 .csv 文件

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_39071593/article/details/84980570
import re
import csv
import requests
from bs4 import BeautifulSoup

# Request headers passed to requests.get() by gethtml(). The User-Agent is a
# desktop Chrome 63 string (the original author describes it as imitating the
# 360 browser).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
}

def gethtml(url, timeout=10):
    """Download *url* and return the response body as decoded text.

    The text (not the response object) is returned because that is what
    BeautifulSoup is given to parse.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The page body as a ``str``.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status, so the body is an error page rather than data.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the parser.
    response.raise_for_status()
    # Use the encoding detected from the body rather than the one declared
    # (or omitted) in the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text

def space(string):
    """Split a run of digits into space-separated two-character groups.

    A space is inserted at every position that follows a word character and
    is followed, up to the end of the string, by a whole number of
    two-character word groups. For example ``"010203"`` becomes
    ``"01 02 03"``; with an odd length the leading group is the single
    leftover character (``"123"`` becomes ``"1 23"``).
    """
    pair_boundary = re.compile(r"(?<=\w)(?=(?:\w\w)+$)")
    return pair_boundary.sub(" ", string)

def soup(html):
    """Parse one results page and write each draw's numbers to the CSV.

    Every element with class ``td-luckyno`` is read for its ``luckyno``
    attribute, which is split into two-character numbers by ``space()``.
    The numbers are mapped, in order, onto the seven CSV columns. Each row
    is printed, and rows with a non-empty ``first`` column are written via
    the module-level ``writer``.

    A missing ``<body>``, a missing ``luckyno`` attribute, or an attribute
    holding fewer than seven numbers no longer raises; the unfilled columns
    stay as empty strings, and a fully empty row is skipped.

    Args:
        html: HTML text of the page.
    """
    columns = ('first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'special')
    body = BeautifulSoup(html, 'lxml').body
    if body is None:
        # Nothing parseable (e.g. empty response): no rows to extract.
        return
    for cell in body.find_all(attrs={'class': "td-luckyno"}):
        numbers = space(cell.attrs.get('luckyno', '')).split()
        row = dict.fromkeys(columns, '')
        # zip stops at the shorter sequence, so short input leaves blanks and
        # any extra numbers beyond the seventh are ignored.
        row.update(zip(columns, numbers))
        print(row)
        if row['first'] != '':
            writer.writerow(row)

if __name__=="__main__":
    fieldnames = ['first','second','third','fourth','fifth','sixth','special']
    # The page loop must stay inside this ``with`` block: soup() writes rows
    # through the module-level ``writer``, and writing after the file is
    # closed fails with "I/O operation on closed file".
    with open('36选7.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        url_prefix = 'http://www.gdfc.org.cn/datas/history/367/history_'
        # Demo run: only history page 1 is fetched. The original author notes
        # that the full history is 150+ pages; widen the range to crawl more.
        for num in range(1, 2):
            soup(gethtml(url_prefix + str(num) + '.html'))
    print('成功!!!')

猜你喜欢

转载自blog.csdn.net/qq_39071593/article/details/84980570