python爬取京东python书籍信息
直接上代码。
需要用到的库:requests lxml pymongo
需要安装这些库,并且配置好mongodb数据库。
import requests
from lxml import etree
import time
import pymongo
class JdSpider(object):
    """Scrape book names and prices from JD (jd.com) search results.

    JD renders 30 books on the initial page load and lazy-loads the
    remaining 30 via an AJAX endpoint (s_new.php), so one visible result
    page takes two HTTP requests.
    """

    # First request: the regular search page (first 30 books).
    url = 'https://search.jd.com/Search?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.his.0.0&page=%d&s=%d&click=0'
    # Second request: AJAX endpoint for the remaining 30 books; the
    # data-sku ids of the first 30 are appended after show_items=.
    next_url = 'https://search.jd.com/s_new.php?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&suggest=2.his.0.0&page=%d&s=%d&scrolling=y&log_id=1569070263.40778&tpl=2_M&show_items='
    page = 1  # internal page counter; JD uses two internal pages per visible page
    s = 3     # offset parameter; observed to advance by 50 per visible page

    def parse_next_page(self):
        """Build the two URLs for the current result page and advance state.

        Returns:
            (page_url, ajax_url) tuple while the incremented page counter
            is still <= 100; ``None`` once the 100-page limit is exceeded.
        """
        page_url = self.url % (self.page, self.s)
        ajax_url = self.next_url % (self.page + 1, self.s + 23)
        self.page += 1
        self.s += 50
        if self.page <= 100:
            return page_url, ajax_url
        # Explicit: the original fell off the end and returned None implicitly,
        # which makes callers fail on tuple unpacking past page 100.
        return None

    def start_request(self, url):
        """GET *url* with a browser User-Agent.

        Returns:
            (lxml element tree, decoded HTML text) for *url*.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            # The AJAX endpoint checks the request origin via the referer header.
            'referer': url,
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        response_str = response.text
        return etree.HTML(response_str), response_str

    def parse_book(self, response_str):
        """Extract book names and prices from raw HTML.

        Keyword-highlight wrappers are stripped first so each name/price is a
        single text node that the XPath expressions can grab whole.

        Returns:
            List of ``{'书籍名称': name, '书籍价格': price}`` dicts.
        """
        # Strip the search-keyword highlight tags so titles are one text node.
        response_str = response_str.replace('<font class="skcolor_ljg">', '').replace('</font>', '')
        names = etree.HTML(response_str).xpath("//li//div[@class='p-name']/a/em/text()")
        # Strip <em>/<i> so the price inside <strong> is one text node.
        response_str = response_str.replace('<em>', '').replace('</em>', '').replace('<i>', '').replace('</i>', '')
        prices = etree.HTML(response_str).xpath("//li//div[@class='p-price']/strong/text()")
        return [
            {'书籍名称': name, '书籍价格': price}
            for name, price in zip(names, prices)
        ]

    def parse_url(self, response):
        """Build the AJAX URL for the last 30 books of the current page.

        Joins the data-sku ids scraped from *response* onto ``show_items=``.
        Note: this also advances the paging state via parse_next_page().
        """
        skus = response.xpath("//ul[@class='gl-warp clearfix']/li/@data-sku")
        _, ajax_url = self.parse_next_page()
        return ajax_url + ','.join(skus)

    def data_clear(self, book_message_list):
        """Return records whose price field is not the bare currency sign.

        A price of exactly '¥' means JD rendered no numeric value for that
        item. Unlike the original (which mutated input dicts with .clear()),
        this filters without touching the caller's data; output is identical.
        """
        return [book for book in book_message_list if book['书籍价格'] != '¥']

    def start_spider(self):
        """Fetch and return the cleaned ~60 book records of one result page."""
        # First request: the regular search page (first 30 books).
        url, _ = self.parse_next_page()
        response, response_str = self.start_request(url)
        first_half = self.parse_book(response_str)
        # Second request: AJAX endpoint (last 30 books).
        ajax_url = self.parse_url(response)
        _, response_str = self.start_request(ajax_url)
        second_half = self.parse_book(response_str)
        return self.data_clear(first_half + second_half)
class Save_data(object):
    """Persist scraped book records into a local MongoDB collection."""

    def __init__(self):
        """Connect to local MongoDB and bind the jdbooks.book collection."""
        host = "127.0.0.1"
        port = 27017
        dbname = "jdbooks"
        sheetname = "book"
        client = pymongo.MongoClient(host=host, port=port)
        # The credential strings below are placeholders — substitute real ones.
        # NOTE(review): Database.authenticate() was removed in pymongo 4;
        # there, pass username/password to MongoClient() instead.
        client.admin.authenticate("数据库名称", "密码", mechanism='SCRAM-SHA-1')
        mydb = client[dbname]
        # Collection handle used by process().
        self.post = mydb[sheetname]

    def process(self, message_list):
        """Insert every record of *message_list*; return True when done."""
        for message in message_list:
            # Collection.insert() is deprecated since pymongo 3 and removed
            # in pymongo 4 — insert_one() is the supported replacement.
            self.post.insert_one(message)
        return True
def main():
    """Prompt for a page count, scrape each page, and store it in MongoDB."""
    spider = JdSpider()
    storage = Save_data()
    # Original bug: `for i in range(i)` shadowed the user-entered count with
    # the loop index; use distinct names.
    pages = int(input("请输入要获取的页数:"))
    for _ in range(pages):
        time.sleep(2)  # throttle: be polite to the server
        book_message_list = spider.start_spider()
        storage.process(book_message_list)
        print('存储成功')


if __name__ == "__main__":
    main()
京东官网对于爬虫还是比较友好的,没有特别复杂的反爬机制。
主要是:页面通过 ajax 动态刷新,需要发现 url 规律并拼接构造参数,再利用 xpath 提取信息;另外 headers 中的 referer 参数用于指定请求来源的 url,可以直接在构造 headers 时添加该参数。详细的看代码吧,有基础的很容易看明白。