url参数分析
eg:
我们看到的:
https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&sort=T&range=0,10&tags=电影,爱情,日本,女性&start=20
编码后:
https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1,%E7%88%B1%E6%83%85,%E6%97%A5%E6%9C%AC,%E5%A5%B3%E6%80%A7&start=20
1. sort
排序方式,有三种:
T:热度排序,
R:时间排序,
S:评价排序:
2.range=0,10
评分范围
3.tags
影视形式,类型,地区,特色
4.其它,可以不管
playable=1:表示可播放
unwatched=1:表示还没看过的
遇到的问题
同一IP地址访问受限的问题,抛出403错误,网上搜到三种解决方法:
1.使用User_Agent,仿造浏览器访问 headers
User_Agents =[
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0(Windows NT 10.0;Win64 x64)AppleWebkit/537.36(KHTML,like Gecko) chrome/58.0.3029.110 Safari/537.36'
]
resp=requests.get(url,headers={'User-Agent':random.choice(User_Agents)})
2.伪造Cookie,解封豆瓣IP
网友说仿造豆瓣生成cookie的格式,真实性未知
jar = requests.cookies.RequestsCookieJar()
jar.set('bid', 'ehjk9OLdwha', domain='.douban.com', path='/')
jar.set('11', '25678', domain='.douban.com', path='/')
resp=requests.get(url,cookies=jar)
3.使用代理IP proxies
网上找高匿IP地址
Agent_IP = ['http://121.31.155.140','http://121.31.150.47','http://118.190.95.35','http://61.135.217.7',\
'http://118.190.95.43','http://115.46.76.188','http://121.31.155.140']
resp=requests.get(url,proxies={'http':random.choice(Agent_IP)})
代码
我根据自己的需要修改了抓取信息处理的部分,删掉了Mongo数据库
import logging
import random
import string
import requests
import time
from collections import deque
from urllib import parse
import pandas as pd
import re
from settings import User_Agents,Agent_IP
class DoubanSpider(object):
    """Douban movie crawler.

    Builds a search URL from the chosen tag filters, downloads the paginated
    JSON results, parses the payload with a regex tokenizer and appends the
    extracted rows to a csv file.
    """
    def __init__(self, form, Type, country, genres):
        """
        :param form: 影视形式 (form tag, e.g. '电影')
        :param Type: 类型 (genre tag, e.g. '爱情')
        :param country: 地区 (region tag, e.g. '日本')
        :param genres: 特色 (feature tag, e.g. '女性')
        """
        # Base URL; note the site itself repeats sort/range in the real URL.
        self.base_url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&'
        self.full_url = self.base_url + '{query_params}'
        # Pick a random User-Agent and proxy to reduce the chance of a 403.
        self.headers = {'User-Agent': random.choice(User_Agents)}
        self.proxies = {'http': random.choice(Agent_IP)}
        # Optional tag parameters.
        self.form_tag = form          # 影视形式
        self.type_tag = Type          # 类型
        self.countries_tag = country  # 地区
        self.genres_tag = genres      # 特色
        # Default query parameters.
        self.sort = 'T'      # sort by popularity ('T'), time ('R') or rating ('S')
        self.range = 0, 10   # rating range

    def encode_query_data(self):
        """Build the full, percent-encoded request URL into ``self.full_url``.

        Fixes over the previous version:
        * tags are joined with ',' up front instead of relying on the
          ``str()`` representation of a list plus character stripping, so the
          query matches the documented form
          ``tags=%E7%94%B5%E5%BD%B1,...`` with no stray spaces or quotes;
        * the URL is rebuilt from ``base_url``, so calling this method more
          than once no longer corrupts ``self.full_url``.
        """
        if not (self.form_tag and self.type_tag and self.countries_tag and self.genres_tag):
            # Any missing tag -> search without tag filtering.
            all_tags = ''
        else:
            all_tags = ','.join([self.form_tag, self.type_tag, self.countries_tag, self.genres_tag])
        query_param = {
            'sort': self.sort,
            'range': ','.join(str(bound) for bound in self.range),
            'tags': all_tags,
        }
        # string.printable: plain ASCII (commas, digits, '&', '=') stays as-is;
        # only the non-ASCII tag text gets percent-encoded.
        query_params = parse.urlencode(query_param, safe=string.printable)
        # Combine with the base URL; leave a placeholder for paging.
        self.full_url = self.base_url + query_params + '&start={start}'

    def download_movies(self, offset):
        """Download one page of movie data.

        :param offset: paging offset (20 movies per request)
        :return: the response object, or None if the request failed
        """
        full_url = self.full_url.format(start=offset)
        print(full_url)
        resp = None
        try:
            # timeout added so a dead proxy cannot hang the crawl forever.
            resp = requests.get(full_url, headers=self.headers, proxies=self.proxies, timeout=10)
        except Exception as e:
            print(resp)
            logging.error(e)
        return resp

    def get_movies(self, resp):
        """Extract the movie payload from a response.

        :param resp: response object returned by :meth:`download_movies`
        :return: the 'data' field of the JSON body, or None (403, empty page
                 or failed request)
        """
        if resp:
            if resp.status_code == 200:  # access denied returns 403
                # resp.json() already yields a dict; no dict() wrapper needed.
                movies = resp.json().get('data')
                if movies:
                    return movies
                # Response contained no movies: past the end of the results.
                return None
            else:
                # Request was answered but not with data (e.g. 403).
                return None

    def save_movies(self, movies):
        """Parse the crawled payload and append rows to ../data/movie.csv.

        :param movies: the 'data' field from :meth:`get_movies` (may be None)
        :return: True if rows were written, False when there was nothing to
                 save (signals the caller to stop paging)
        """
        # An empty / missing payload stringifies very short -> nothing to save.
        if len(str(movies)) < 20:
            return False
        # Tokenize: ratings like '8.5', or word runs (incl. CJK and punctuation).
        words = re.findall(pattern=r'\d\.\d|\w+(?:[ ,\-:·!。\?\(\)]?\w*)*', string=str(movies))
        items = []
        item = None   # record currently being filled
        key = None    # field the following value tokens belong to
        flag = True   # True -> start a new record at the next token
        for word in words:
            if word == 'cover':
                # 'cover' marks a record boundary: strip the trailing '|'
                # separators and store the finished record. The item-None
                # guard fixes a NameError when 'cover' is the first token.
                if item is not None:
                    item['directors'] = item['directors'][:-1]
                    item['casts'] = item['casts'][:-1]
                    items.append(item)
                key = None
                flag = True
            if flag:
                item = {'directors': '', 'rate': 0, 'title': None, 'casts': '', 'form': self.form_tag,
                        'Type': self.type_tag, 'country': self.countries_tag, 'genres': self.genres_tag}
                flag = False
            if word in item.keys():
                key = word
            elif key is not None:
                if key in ['rate', 'title']:
                    # Single-valued fields.
                    item[key] = word
                    key = None
                else:
                    # Multi-valued fields (directors/casts): '|'-separated.
                    item[key] += word
                    item[key] += '|'
        if not items:
            # Nothing parsed out of the payload: stop paging instead of
            # appending an empty frame and looping forever.
            return False
        frame = pd.DataFrame.from_dict(items)
        # No index, no header, append mode.
        frame.to_csv('../data/movie.csv', index=0, header=0, mode='a')
        return True
def main():
    """Douban movie crawler entry point.

    Walks every combination of form/type/country/genre tags and crawls each
    combination page by page until the site stops returning data.
    """
    form_tags = ['电影','电视剧','综艺','动画','纪录片','短片']
    Type_tags = ['剧情','喜剧','动作','爱情','科幻','悬疑','惊悚','恐怖','犯罪','同性','音乐','歌舞','传记','历史',\
                '战争','西部','奇幻','冒险','灾难','武侠','情色']
    country_tags = ['中国大陆','美国','香港','台湾','日本','韩国','英国','法国','德国','意大利','西班牙','印度',\
                    '泰国','俄罗斯','伊朗','加拿大','澳大利亚','爱尔兰','瑞典','巴西','丹麦']
    genres_tags = ['经典','青春','文艺','搞笑','励志','魔幻','感人','女性','黑帮']
    for form in form_tags:
        for Type in Type_tags:
            for country in country_tags:
                for genres in genres_tags:
                    print(form, Type, country, genres)
                    # Fresh spider per tag combination (random UA / proxy).
                    spider = DoubanSpider(form=form, Type=Type, country=country, genres=genres)
                    # Encode the tags into the final query URL.
                    spider.encode_query_data()
                    offset = 0
                    while True:
                        # Fetch one page of results.
                        response = spider.download_movies(offset)
                        print(response)
                        # Pull the movie payload out of the response.
                        movies = spider.get_movies(response)
                        # Persist; False means this combination is exhausted.
                        has_more = spider.save_movies(movies)
                        print(offset, has_more)
                        offset += 20
                        # Throttle requests: crawling too fast gets the IP banned.
                        time.sleep(5)
                        if not has_more:
                            break
                    # Longer pause between tag combinations.
                    time.sleep(100)
if __name__ == '__main__':
    main()