import re

import requests
import xlwt
from bs4 import BeautifulSoup


def _tag_text(tag):
    """Return the whitespace-stripped text of a parsed tag, or '' if tag is None."""
    if tag is None:
        return ''
    return tag.get_text().strip()


def _strip_parens(text):
    """Remove surrounding parentheses (if any) and the whitespace inside them."""
    return text.strip('()').strip()


class DoubanMusic:
    """Scrape the Douban Music Top 250 chart and save it to an .xls workbook.

    Typical use is ``DoubanMusic(0).start()``, which walks the chart pages,
    collects one row per entry in ``music_list`` and then calls ``load``.
    """

    def __init__(self, pageIndex):
        """Set up the scraper.

        Args:
            pageIndex: Offset of the first chart entry to fetch (0, 25, ...).
        """
        # Previously the argument was ignored and 0 was always used.
        self.pageIndex = pageIndex
        self.user_agent = 'Mozilla/5.0'
        self.headers = {'User-agent': self.user_agent}
        # One row per chart entry: [name, singer, score, words, musicUrl].
        self.music_list = []

    def getHtml(self):
        """Download the chart page that starts at ``self.pageIndex``.

        Returns:
            The page HTML as text, or '' when the request fails (network
            error, timeout or a non-2xx status).
        """
        url = 'https://music.douban.com/top250'
        try:
            # Headers must be passed by keyword: the second positional
            # argument of requests.get() is ``params``, not ``headers``.
            # NOTE(review): the offset is sent as ``?start=<offset>``; the
            # original appended the bare number with no key. Confirm against
            # the live site.
            r = requests.get(
                url,
                params={'start': self.pageIndex},
                headers=self.headers,
                timeout=10,
            )
            r.raise_for_status()
        except requests.RequestException as err:
            # Best-effort scraping: report the failure and let the caller
            # continue with an empty page instead of aborting the whole run.
            print('页面抓取失败: %s' % err)
            return ''
        r.encoding = 'utf-8'
        return r.text

    def parseHtml(self):
        """Parse the current chart page and append its entries to ``music_list``.

        Each ``<tr class="item">`` row yields one list
        ``[name, singer, score, words, musicUrl]``. Rows without any link are
        skipped; missing sub-elements produce empty strings.
        """
        soup = BeautifulSoup(self.getHtml(), 'html.parser')
        for music in soup.find_all('tr', {'class': 'item'}):
            # NOTE(review): this uses the first <a> of the row, as the
            # original did; verify it is the title link and not a cover link.
            link = music.a
            if link is None:
                continue
            musicUrl = link.get('href', '')
            name = link.get_text().strip()

            # <p class="pl"> holds '/'-separated fields. It must be split,
            # not stripped, before indexing; only the first field is used.
            detail = _tag_text(music.find('p', {'class': 'pl'})).split('/')
            singer = detail[0].strip()

            score = _strip_parens(
                _tag_text(music.find('span', {'class': 'rating_nums'})))
            words = _strip_parens(
                _tag_text(music.find('span', {'class': 'pl'})))

            self.music_list.append([name, singer, score, words, musicUrl])

    def load(self, datalist):
        """Write ``datalist`` to the workbook '豆瓣音乐Top2500.xls'.

        Args:
            datalist: Iterable of rows; the first three items of each row are
                written under the first three header cells.
        """
        workbook = xlwt.Workbook()
        # NOTE(review): 'Top2500' looks like a typo for 'Top250'; it is kept
        # because it determines the sheet name and output file name.
        sheet = workbook.add_sheet('豆瓣音乐Top2500', cell_overwrite_ok=True)
        col = (u'歌名', u'歌手', u'得分', u'啥么')
        # Only the first three columns are exported, as before.
        width = 3
        for j, title in enumerate(col[:width]):
            sheet.write(0, j, title)
        # Iterate over what was actually scraped rather than a fixed 250
        # rows, which raised IndexError whenever fewer rows were available.
        for i, data in enumerate(datalist, start=1):
            for j, value in enumerate(data[:width]):
                sheet.write(i, j, value)
        workbook.save('豆瓣音乐Top2500.xls')

    def start(self):
        """Scrape every chart page from ``self.pageIndex`` up to offset 225, then export."""
        print('开始为您抓取豆瓣音乐Top250')
        while self.pageIndex <= 225:
            print('正在抓取第%d页' % (self.pageIndex // 25 + 1))
            # The original called getHtml() here and discarded the result,
            # so nothing was ever parsed or stored.
            self.parseHtml()
            self.pageIndex += 25
        print("抓取完成")
        self.load(self.music_list)


# Runs on import as well as when executed as a script (original behavior).
music = DoubanMusic(0)
music.start()
爬取豆瓣音乐Top250并存入xls
猜你喜欢
转载自blog.csdn.net/u010356229/article/details/81005475
今日推荐
周排行