# 版权声明:自学笔记,如有引用请标明博客,感谢 https://blog.csdn.net/feng_jlin/article/details/82386758
# -*- coding: utf-8 -*-
import urllib.request as urlrequest
from bs4 import BeautifulSoup
import time #休息时间
import random #为了时间随机
import bs4 #注意点1:引入模块
import re
#读取链接并解析
def urlAsk(url_visit):
    """Fetch *url_visit* and return the page parsed as a BeautifulSoup tree.

    A desktop-browser User-Agent is sent because Douban blocks the default
    urllib agent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    request = urlrequest.Request(url=url_visit, headers=headers)
    # Use a context manager so the HTTP response/socket is always closed
    # (the original leaked it by never calling close()).
    with urlrequest.urlopen(request) as response:
        http_content = response.read().decode('utf8')  # pages contain Chinese text
    return BeautifulSoup(http_content, 'html.parser')
#获取TOP250页面内容,并判断空值
def data_append(find_re_append, each_item_divs, target=None):
    """Append the first match of *find_re_append* in *each_item_divs* to a list.

    Appends '无' ("none") when nothing matches, so every record keeps the
    same number of columns.  *target* defaults to the module-level ``data``
    list for backward compatibility with the existing top-level scrape loop;
    passing an explicit list removes the hidden-global dependency.

    Returns the list that was appended to.
    """
    if target is None:
        target = data  # legacy global populated by the scraping loop
    re_append = re.findall(find_re_append, each_item_divs)
    target.append(re_append[0] if re_append else '无')
    return target
#获取book详情页信息
def get_data_book(url_book):
    """Fetch a book's detail page and return its author name, or '无'.

    The author is extracted from the "作者:" ("author") row of the info box.
    """
    soup_book = str(urlAsk(url_book))
    # NOTE: the original comment said "rating"; this pattern extracts the author.
    find_re_author = re.compile(
        r'<span class="pl">作者:</span>.*?<a href="https://book.douban.com/author/.*?/">(.*?)</a>.*?<br/>',
        re.S)
    authors = re.findall(find_re_author, soup_book)
    if not authors:
        # Guard: the original indexed [0] and raised IndexError when the
        # page had no matching author row.
        return '无'
    # Strip layout newlines/indentation around the name.  The original called
    # re.sub(pat, '', s, re.S) -- but re.sub's 4th positional argument is
    # *count*, not *flags*, so re.S (== 16) silently became a match limit.
    return re.sub(r'\n\s*', '', authors[0])
#
book_top250_url = "https://book.douban.com/top250?start={}"

# Pre-compiled patterns for the fields scraped from each list item.
find_re_herf = re.compile(r'<a href="(.*?)" onclick') # detail-page link
find_re_title = re.compile(r'quot;" title="(.*?)">') # book title
find_re_ratingnums = re.compile(r'<span class="rating_nums">(.*?)</span>') # rating
find_re_population = re.compile(' (.*?)评价') # number of ratings
find_re_inq = re.compile('<span class="inq">(.*?)</span>') # one-line blurb

def getData(book_top250_url=book_top250_url):
    """Scrape all 10 pages (250 books) of the Douban book Top 250 list.

    Returns a list of per-book records:
    ``[link, title, rating, rating_count, blurb, author]``.

    The original left this as top-level script code (see the commented-out
    ``def getData`` it derived from) and never appended to ``datalist`` nor
    stored the fetched author, so the result list stayed empty -- fixed here.
    """
    global data  # data_append() defaults to the module-level ``data`` list
    datalist = []
    for page in range(10):
        start = page * 25  # list pagination: 25 books per page
        soup = urlAsk(book_top250_url.format(start))
        # Every book on the list page lives in a class="item" container.
        for each_item_divs in soup.find_all(class_="item"):
            data = []  # fresh record for this book
            each_item_divs = str(each_item_divs)
            data_append(find_re_herf, each_item_divs)
            data_append(find_re_title, each_item_divs)
            data_append(find_re_ratingnums, each_item_divs)
            data_append(find_re_population, each_item_divs)
            data_append(find_re_inq, each_item_divs)
            # Follow the detail link for the author and keep the record.
            data.append(get_data_book(data[0]))
            print(data[-1])  # progress output, as in the original
            datalist.append(data)
        # Polite random pause between pages; ``time`` and ``random`` were
        # imported for exactly this purpose but never used in the original.
        time.sleep(random.uniform(1, 3))
    return datalist
#将相关数据写入excel中
def saveData(datalist, savepath):
    """Write the scraped records to an Excel workbook at *savepath*.

    NOTE(review): the column headers below were copy-pasted from a movie
    scraper and do not match the book fields actually scraped -- confirm
    the intended schema with the author.
    """
    # Third-party dependency; imported locally because the original used
    # xlwt without ever importing it (NameError at call time).
    import xlwt

    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('豆瓣图书Top250', cell_overwrite_ok=True)
    col = ('电影详情链接', '图片链接', '影片中文名', '影片外国名',
           '评分', '评价数', '概况', '导演', '主演', '年份', '地区', '类别')
    for j, header in enumerate(col):
        sheet.write(0, j, header)  # header row
    # Iterate the actual records instead of assuming exactly 250 rows of
    # exactly 12 fields each (the original raised IndexError on shorter
    # records and on result lists of any other length).
    for i, record in enumerate(datalist):
        for j, value in enumerate(record[:len(col)]):
            sheet.write(i + 1, j, value)
    book.save(savepath)
def main():
    """Scrape the Douban book Top 250 and save the records to Excel."""
    # Fixed: the original used the *movie* Top250 URL for this book scraper
    # and passed an undefined name ``baseurl`` to getData().
    book_top250_url = 'https://book.douban.com/top250?start={}'
    datalist = getData(book_top250_url)
    # Fixed the ``savapath`` typo.  xlwt emits legacy .xls data, so an
    # .xlsx extension would be misleading.
    savepath = u'豆瓣图书Top250.xls'
    saveData(datalist, savepath)