def askurl(url):
    """Download *url* and return its body decoded as UTF-8 text.

    The request carries a desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urllib.request.urlopen`` is propagated
        unchanged.
    """
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"}
    # Wrap the URL and headers in a Request object.
    request = urllib.request.Request(url=url, headers=head)
    # The context manager closes the response (and its underlying
    # connection) even when read() or decode() raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
# Pre-compiled patterns applied to the serialized HTML of one list item.

# href value of an anchor tag (movie detail link).
findLink = re.compile('<a href="(.*?)">')

# src value of an <img> tag (poster image); "." also matches newlines.
findImgsrc = re.compile(r'<img.*src="(.*?)"', re.DOTALL)

# Text of every <span class="title"> element.
findTitle = re.compile(r'<span class="title">(.*)</span>')

# Text of the rating <span>.
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')

# Digits followed by the literal suffix "人评价" inside a bare <span>.
findJudge = re.compile(r'<span>([\d]*人评价)</span>')

# Text of the <span class="inq"> element (one-line summary).
findInq = re.compile(r'<span class="inq">(.*)</span>')

# Contents of the <p class=""> element; "." also matches newlines.
findBd = re.compile(r'<p class="">(.*?)</p>', re.DOTALL)
def add(datalist, b, item):
    """Append the first match of pattern *b* in *item* to *datalist*.

    Args:
        datalist: List that receives the extracted value (mutated in place).
        b: Regular expression, either a pattern string or a compiled
            pattern, as accepted by ``re.findall``.
        item: Text to search.

    Returns:
        The same *datalist* object, for convenience.

    When the pattern does not match, a single-space placeholder is
    appended instead of raising ``IndexError``, so the row keeps its
    column alignment.
    """
    matches = re.findall(b, item)
    # " " is the placeholder for a missing field.
    datalist.append(matches[0] if matches else " ")
    return datalist
def getdata(baseurl):
    """Scrape ten list pages and return one row of fields per movie item.

    Pages are requested at ``baseurl + str(offset)`` for offsets
    0, 25, ..., 225. Every ``<div class="item">`` on a page yields one
    row with eight entries, in this order: detail link, image link,
    first title, second title, rating, number of ratings, one-line
    summary, and the descriptive paragraph.

    Args:
        baseurl: URL prefix to which the numeric start offset is appended.

    Returns:
        A list of rows, each row being a list of eight strings. A field
        this function extracts itself and cannot find is stored as a
        single space.
    """
    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)
        html = askurl(url)
        soup = BeautifulSoup(html, 'html.parser')
        # Each "item" div holds everything needed for one movie.
        for item in soup.find_all('div', class_='item'):
            item = str(item)
            data = []
            add(data, findLink, item)      # movie detail link
            add(data, findImgsrc, item)    # poster image link

            # First title, then the optional second title; keep both
            # columns present even when a title is missing.
            titles = re.findall(findTitle, item)
            data.append(titles[0] if titles else " ")
            if len(titles) > 1:
                data.append(titles[1].replace("/", ""))
            else:
                data.append(" ")

            add(data, findRating, item)
            add(data, findJudge, item)

            inq = re.findall(findInq, item)
            if inq:
                # Drop the full stop from the one-line summary.
                data.append(inq[0].replace("。", ""))
            else:
                data.append(" ")

            bd_matches = re.findall(findBd, item)
            if bd_matches:
                # Strip line-break tags in any spelling (<br>, <br/>,
                # <br />). The previous pattern required whitespace before
                # "/>" and therefore never matched the compact "<br/>".
                bd = re.sub(r'<br\s*/?>\s*', ' ', bd_matches[0])
                bd = bd.replace('/', ' ')  # remaining slashes are separators
                data.append(bd.strip())
            else:
                data.append(" ")
            datalist.append(data)
    return datalist
def savedata(datalist, savepath):
    """Write the scraped rows to an Excel workbook saved at *savepath*.

    Row 0 of the sheet holds the eight column headers; each entry of
    *datalist* is written to the following rows in order. A progress
    line is printed for every row.

    Args:
        datalist: Sequence of rows, each a sequence of cell values in
            header order. Any number of rows is accepted; values beyond
            the number of headers are ignored.
        savepath: Destination path passed to ``book.save``.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影top250', cell_overwrite_ok=True)
    col = ('电影详情链接', '图片链接', '影片中文名', '影片外国名', '评分', '评价数', '概况', '相关信息')
    # Header row.
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate over the rows actually scraped rather than a fixed count of
    # 250, so a short result does not raise IndexError and a long one is
    # not truncated.
    for i, data in enumerate(datalist):
        print("第%d条" % i)
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i + 1, j, value)  # +1 skips the header row
    book.save(savepath)