# 爬取豆瓣音乐Top250并存入xls

import requests
from bs4 import BeautifulSoup
import re
import xlwt

class DoubanMusic:
    """Scrape the Douban Music Top 250 chart and export it to an .xls workbook.

    Typical use is ``DoubanMusic(0).start()``, which walks the chart pages from
    the given offset, collects one row per entry in ``music_list`` and then
    writes the rows to a spreadsheet via ``load``.
    """

    # Paging step used by start(); the page number shown to the user is
    # derived from it (offset // step + 1).
    _PAGE_SIZE = 25
    # Largest offset start() will still fetch (the tenth page of 25 entries).
    _LAST_PAGE_INDEX = 225
    # Seconds to wait for the server before giving up on a page.
    _REQUEST_TIMEOUT = 10

    def __init__(self, pageIndex):
        """Create a scraper that begins at chart offset ``pageIndex``."""
        # Offset of the first chart entry on the page to fetch next.
        self.pageIndex = pageIndex
        self.user_agent = 'Mozilla/5.0'
        self.headers = {'User-agent': self.user_agent}
        # Collected rows: [name, singer, score, words, musicUrl].
        self.music_list = []

    def _page_url(self):
        """Return the chart URL for the page starting at ``self.pageIndex``."""
        # The offset has to be sent under the ``start`` key; a bare
        # ``?<number>`` query carries no parameter name.
        return 'https://music.douban.com/top250?start=' + str(self.pageIndex)

    def getHtml(self):
        """Download the current chart page.

        Returns:
            The page HTML decoded as UTF-8, or ``''`` when the request fails,
            times out, or the server answers with an error status.
        """
        try:
            # ``headers`` must be passed by keyword: the second positional
            # argument of requests.get is ``params``.
            r = requests.get(self._page_url(), headers=self.headers,
                             timeout=self._REQUEST_TIMEOUT)
            r.raise_for_status()
        except requests.RequestException:
            # Best effort: a failed page yields no rows instead of aborting.
            return ''
        r.encoding = 'utf-8'
        return r.text

    @staticmethod
    def _split_detail(text):
        """Split a '/'-separated detail line into whitespace-stripped fields."""
        return [field.strip() for field in text.split('/')]

    @staticmethod
    def _text_of(tag):
        """Return the text of a parsed tag, or ``''`` when the tag is missing."""
        return '' if tag is None else tag.get_text()

    def parseHtml(self):
        """Fetch the current page and append one row per chart entry.

        Each row appended to ``self.music_list`` is
        ``[name, singer, score, words, musicUrl]``. Entries without a link are
        skipped; other missing fields are stored as empty strings.
        """
        soup = BeautifulSoup(self.getHtml(), 'html.parser')
        for music in soup.find_all('tr', {'class': 'item'}):
            link = music.a
            if link is None:
                continue
            musicUrl = link.get('href', '')
            name = link.get_text().strip()
            # The detail line is '/'-separated; its first field is the singer.
            detail = self._split_detail(
                self._text_of(music.find('p', {'class': 'pl'})))
            singer = detail[0]
            score = self._text_of(
                music.find('span', {'class': 'rating_nums'})
            ).strip('(').strip(')').strip()
            words = self._text_of(music.find('span', {'class': 'pl'})).strip()
            self.music_list.append([name, singer, score, words, musicUrl])

    def load(self, datalist):
        """Write ``datalist`` to the workbook '豆瓣音乐Top2500.xls'.

        Args:
            datalist: Sequence of rows. The leading fields of each row are
                written under the header columns; extra fields are ignored.
        """
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet('豆瓣音乐Top2500', cell_overwrite_ok=True)
        col = (u'歌名', u'歌手', u'得分', u'啥么')
        for j, title in enumerate(col):
            sheet.write(0, j, title)
        # Iterate over the rows actually collected instead of assuming 250,
        # so a partial scrape still produces a file.
        for i, data in enumerate(datalist):
            for j, value in enumerate(data[:len(col)]):
                sheet.write(i + 1, j, value)
        workbook.save('豆瓣音乐Top2500.xls')

    def start(self):
        """Scrape every page from the current offset and export the result."""
        print('开始为您抓取豆瓣音乐Top250')
        while self.pageIndex <= self._LAST_PAGE_INDEX:
            print('正在抓取第%d' % (self.pageIndex // self._PAGE_SIZE + 1))
            # parseHtml downloads the page itself and fills music_list.
            self.parseHtml()
            self.pageIndex += self._PAGE_SIZE
        print("抓取完成")
        self.load(self.music_list)

# Script entry: build a scraper positioned at chart offset 0 and run the
# whole scrape-and-export cycle.
music = DoubanMusic(pageIndex=0)
music.start()


# 猜你喜欢

# 转载自blog.csdn.net/u010356229/article/details/81005475