# 有点瑕疵,需要改进 (NOTE: has minor flaws, needs improvement)
# 1. Fetch the page's HTML source
# 2. Parse the data and extract the desired fields
# 3. Save the data
import re
import time
import requests
import csv
for i in range(0,25):
n=i*25
url='https://movie.douban.com/top250?start='+str(n)
header={
"Cookie": 'bid=W9LftjwdnG0; douban-fav-remind=1; __gads=ID=41bcea934a3c4395-22fb6b939ec900d8:T=1624083400:RT=1624083400:S=ALNI_MYPXBu1mjn03x8CNBfjvfsq0-vsUQ; _ga=GA1.2.909847905.1624083402; ll="108309"; _vwo_uuid_v2=DD671F298AFD7FA9272C53146D518B3E0|c2f0060def89c140abb182e52b239d49; _ga=GA1.3.909847905.1624083402; __utmz=30149280.1635140575.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ct=y; dbcl2="246515829:7icG4avgWbE"; push_doumail_num=0; push_noty_num=0; __utmv=30149280.24651; __utmz=223695111.1635827335.12.4.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ck=eDr-; ap_v=0,6.0; __utmc=30149280; __utmc=223695111; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1636517977%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.909847905.1624083402.1636513873.1636517977.19; __utma=223695111.909847905.1624083402.1636513873.1636517977.16; __utmb=223695111.0.10.1636517977; __utmt=1; __utmb=30149280.6.9.1636519264670; _pk_id.100001.4cf6=f2581a16f3a4e759.1634360027.16.1636519267.1636513873.',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}
response=requests.get(url,headers=header)
# print(response)
response.encoding='utf-8' #对网页源代码转义成中文
html=response.text #得到网页源代码
#制定正则规则
patter=re.compile(r'<li>.*? <span class="title">(?P<name>.*?)</span>.*?'
r' <div class="bd">.*?<p class="">.*?<br>(?P<year>.*?) .*?</p>.*?'
r'<span class="rating_num" property="v:average">(?P<ratingnum>.*?)</span>.*?'
r'<span>(?P<preson>.*?)人评价</span>.*? '
r'</div>.*?<p class="quote">.*?<span class="inq">(?P<stence>.*?)</span>.*?</p>.*?<p>',re.S)
result=patter.finditer(html) #通过正则匹配,匹配出自己想要的内容,返回一个迭代器
with open("daoban.csv",mode='a',newline='') as f: #防止覆盖,将mode=‘w’改为‘a’,直接在后面最加,但你得先创建出daoban。csv这个文件
csvwriter=csv.writer(f) #创建一个写入的对象
for i in result:
# print(i.group('name'))
# print(i.group('year').strip())
# print(i.group('ratingnum'))
# print(i.group('preson'))
dict=i.groupdict() #将匹配出的结果输出成字典
dict['year']=dict['year'].strip() #对年份进行规范处理
# if len(dict['stence'])==0:
# dict['stence']=null #因为我想要匹配到的一句话概述,有些影片没有,有些影片有,为了防止覆盖而设置,但是结果依然解决不了,所有这是本代码有问题的小点
csvwriter.writerow(dict.values())
# print(dict)
time.sleep(0.01) #,爬取一个页面休眠一会,模拟用户行为
print("ok")
# 运行结果: (run output:)