爬虫小练习:分别用面向对象和函数式编程两种方式爬取豆瓣电影 Top250

 1 class Grasp:
 2     def __init__(self):
 3         for i in range(0, 10):
 4             self.url = f"https://movie.douban.com/top250?start={25*i}&filter="
 5             self.html = urlopen(self.url).read().decode()
 6             self.htmlobj = et.HTML(self.html)
 7             self.res = self.htmlobj.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()")  # 电影名
 8             self.dicr = self.htmlobj.xpath("//div[@class ='bd']/p[1]/text()")  # 导演
 9             self.cri = self.htmlobj.xpath("//p[@class ='quote']/span//text()")  # 介绍
10             self.score = self.htmlobj.xpath("//div[@class ='star']/span[@class='rating_num']/text()")  # 评分
11             print((''.join(self.dicr)).strip().replace(' ', '').replace('/', '').split(',')[0:1]) 12 def getName(self): 13 pass 14 def getDir(self): 15 pass 16 def getScore(self): 17 pass 18 def getCri(self): 19 pass 20 21 def run(self): 22 wb = xlwt.Workbook(encoding='utf-8') 23 ws = wb.add_sheet('豆瓣电影') 24 for i in range(0,10): 25 for j in range(0, len(self.res)): 26 ws.write(i, j * 4, (''.join(self.res[j])).strip().replace(' ', '').replace('/', '')) 27 ws.write(i, 4 * j + 1, (''.join(self.dicr[j * 2])).strip().replace('/', '').split(',')[0:1]) 28 try: 29 ws.write(i, 4 * j + 2, (''.join(self.cri[j])).strip()) 30 except: 31 ws.write(i, 4 * j + 2,'没有介绍') 32 ws.write(i, 4 * j + 3, (''.join(self.score[j])).strip().replace(' ', '').replace('/', '')) 33 wb.save('./豆瓣电影/movie.xls') 34 35 r = Grasp() 36 r.run() 37 38 39 40 def reader(): 41 wb = xlwt.Workbook(encoding='utf-8') 42 ws = wb.add_sheet('豆瓣电影') 43 for i in range(0, 10): 44 url = f"https://movie.douban.com/top250?start={25*i}&filter=" 45 html = urlopen(url).read().decode() 46 htmlobj = et.HTML(html) 47 res = htmlobj.xpath("//div[@class ='hd']/a/span[@class='title'][1]/text()") #电影名 48 dicr = htmlobj.xpath("//div[@class ='bd']/p[1]/text()") #导演 49 cri = htmlobj.xpath("//p[@class ='quote']/span//text()") #介绍 50 score = htmlobj.xpath("//div[@class ='star']/span[@class='rating_num']/text()") #评分 51 for j in range(0, len(res)): 52 d = ((''.join(dicr[j*2])).strip().replace(' ', '').replace('/', '')).find('主') 53 ws.write(i, j*4, (''.join(res[j])).strip().replace(' ','').replace('/','')) 54 ws.write(i, 4*j+1, ((''.join(dicr[j*2])).strip().replace(' ', '').replace('/', ''))[0:d]) 55 try: 56 ws.write(i, 4*j+2, (''.join(cri[j])).strip()) 57 except: 58 ws.write(i, 4 * j + 2,'没有介绍') 59 ws.write(i, 4*j+3, (''.join(score[j])).strip().replace(' ', '').replace('/', '')) 60 wb.save('./豆瓣电影/movie.xls') 61 62 reader()
将爬取的数据存储为表格
''.join()、strip()、replace() 这些调用是为了将数据转换为单纯的字符串,并除去特殊字符和空格,便于查看数据
此外,需要注意的是xpath获取的是一个列表,可以用列表的方法进行操作,不需要进行多余的转化

猜你喜欢

转载自www.cnblogs.com/superSmall/p/11502872.html