"""Scrape Douban's "high score" movie listing and append the results to a CSV.

The script pages through Douban's JSON search endpoint, follows every movie
URL it returns, pulls the title, rating, director, cast and runtime out of the
detail page, and appends one CSV row per movie.
"""
import csv
import json
import logging
import os
import re  # noqa: F401 - not used below; retained from the original import list
import time

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Every movie detail URL discovered so far, in discovery order.
urls = []

# CSV header row: name, score, director, cast, runtime.
tc = ['名字', '评分', '导演', '演员', '时长']

CSV_PATH = 'C:\\Users\\lenovo\\Desktop\\go1.csv'

header = {'Host': 'movie.douban.com',
          'Referer': 'https://movie.douban.com/explore',
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# The address shown in the browser does not change when paging, so this JSON
# endpoint was taken from the browser's developer tools instead.
SEARCH_URL = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend&page_limit=20&page_start={}'

PAGE_COUNT = 25  # number of listing pages to request
PAGE_SIZE = 20  # must match page_limit in SEARCH_URL
REQUEST_DELAY_SECONDS = 10  # pause after each detail-page request
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection hangs forever


def _get(url):
    """Fetch *url* with the shared headers and raise on an HTTP error status."""
    response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response


def _text(node):
    """Return the text of a BeautifulSoup node, or '' when the node is missing."""
    return node.text if node is not None else ''


def _append_row(row):
    """Append a single row to the CSV file.

    The file is reopened for every row so that rows already scraped are on
    disk even if a later request fails.
    """
    with open(CSV_PATH, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(row)


def _ensure_header():
    """Write the header row only when the CSV file is missing or empty.

    The file is opened in append mode, so writing the header unconditionally
    would insert a duplicate header row on every re-run.
    """
    if os.path.exists(CSV_PATH) and os.path.getsize(CSV_PATH) > 0:
        return
    _append_row(tc)


def fetch_subject_urls(page_start):
    """Return the movie detail URLs of the listing page starting at *page_start*.

    Raises:
        requests.RequestException: the request failed or returned an error status.
        ValueError: the response body was not valid JSON.
    """
    response = _get(SEARCH_URL.format(page_start))
    # json.loads parses the JSON text of the response into a dict.
    data = json.loads(response.text)
    # Iterate over what was actually returned instead of assuming PAGE_SIZE
    # entries; the last page (or an error payload) may contain fewer.
    return [subject['url'] for subject in data.get('subjects', []) if 'url' in subject]


def parse_movie(html):
    """Extract one CSV row from a movie detail page.

    Returns:
        ``[name, score, director, cast, runtime]`` as strings, or ``None``
        when the page has no title element (it is then not a usable movie
        page). Other missing fields become empty strings.
    """
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find('span', {'property': 'v:itemreviewed'})
    if name is None:
        return None

    score = soup.find('strong', {'property': 'v:average'})
    # The first "attrs" span on the page is used as the director field.
    director = soup.find('span', {'class': 'attrs'})
    runtime = soup.find('span', {'property': 'v:runtime'})

    cast = []
    for attrs_span in soup.find_all('span', {'class': 'attrs'}):
        for actor_link in attrs_span.find_all('a', {'rel': 'v:starring'}):
            cast.append(actor_link.text)

    # Join the cast into one readable cell; passing the list itself to
    # csv.writer would store its Python repr ("['a', 'b']").
    return [name.text, _text(score), _text(director), ' / '.join(cast), _text(runtime)]


def scrape_movie(movie_url):
    """Fetch and parse one movie page; return its CSV row or ``None`` on failure."""
    try:
        response = _get(movie_url)
    except requests.RequestException as err:
        logger.warning('Skipping %s: request failed (%s)', movie_url, err)
        return None

    row = parse_movie(response.text)
    if row is None:
        logger.warning('Skipping %s: no movie title found on the page', movie_url)
    return row


def main():
    """Walk the listing pages and append every movie found to the CSV file."""
    logging.basicConfig(level=logging.INFO)
    _ensure_header()

    for page_index in range(PAGE_COUNT):
        page_start = page_index * PAGE_SIZE
        try:
            page_urls = fetch_subject_urls(page_start)
        except (requests.RequestException, ValueError) as err:
            logger.warning('Skipping listing page at offset %d: %s', page_start, err)
            continue

        if not page_urls:
            logger.info('No subjects returned at offset %d; stopping.', page_start)
            break

        for movie_url in page_urls:
            urls.append(movie_url)
            row = scrape_movie(movie_url)
            if row is not None:
                _append_row(row)
            # Pause between detail-page requests, successful or not.
            time.sleep(REQUEST_DELAY_SECONDS)


if __name__ == '__main__':
    main()
爬取豆瓣高分电影
猜你喜欢
转载自www.cnblogs.com/persistence-ok/p/10949339.html
今日推荐
周排行