# ============ Listing page (列表页) ============
import time
import pymongo
import requests
import csv
#数据存储本地csv
# def save(list):
# print(list)
# csvFile = open(fr'{ky}.csv', 'a', newline='', encoding='utf-8-sig') # 设置newline,否则两行之间会空一行
# writer = csv.writer(csvFile)
# writer.writerow(list)
# csvFile.close()
#配置mongodb
def save(result):
    """Insert one listing-page record into MongoDB (db 'spider', collection 'snlb').

    result: dict of product fields built by spider().
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        dbname = client['spider']
        MONGO_TABLE = dbname['snlb']
        # Collection.insert() was deprecated and removed in PyMongo 4;
        # insert_one() is the supported single-document replacement.
        MONGO_TABLE.insert_one(result)
    finally:
        # Close the per-call connection so sockets are not leaked.
        client.close()
def spider(ky, end_page):
    """Crawl Suning mobile search listing pages for keyword *ky*.

    ky: search keyword inserted into the query URL.
    end_page: last page index; pages 0..end_page inclusive are fetched
        (matches the original cp= parameter range).
    Every product record found is persisted via save().
    """
    url_list = [
        f'https://fast.suning.com/search/emall/mobile/clientSearch.jsonp?set=5&ps=10&channelId=MOBILE&keyword={ky}&st=0&ci=&cf=&sc=&cp={page}&iv=-1&ct=-1&sp=&spf=&prune=1&sg=1&operate=0&istongma=1&sesab=ABAABABB&nonfirst=1&v=1.27&snyp=0&jzq=17708&lx='
        for page in range(end_page + 1)
    ]
    # Walk every listing page until the server stops returning results.
    try:
        for url in url_list:
            # verify=False preserves the original behavior (endpoint has
            # certificate issues); the timeout prevents a hung crawl.
            r = requests.get(url, verify=False, timeout=10)
            if len(r.text) < 1000:
                # A near-empty body means no more results: stop paging.
                break
            data_list = r.json()['goods']
            for data in data_list:
                try:
                    # Extract the fields we persist for each product.
                    title = data["catentdesc"]
                    price = data["price"]
                    shop_id = data["salesCode"]
                    shop_name = data["salesName"]
                    goods_id = data["catentryId"]
                    detail_url = f'https://product.suning.com/{shop_id}/{goods_id}.html'
                    praiseRate = data["praiseRate"]
                    image = 'https:' + data["dynamicImg"]
                    comment = data["extenalFileds"]["commentShow"]
                except (KeyError, TypeError):
                    # Skip records missing an expected field (the original
                    # bare except also silently hid save() failures).
                    continue
                info = {'name': ky, 'title': title, 'price': price,
                        'shop_name': shop_name, 'shop_id': shop_id,
                        'goods_id': goods_id, 'praiseRate': praiseRate,
                        'comment': comment, 'image': image,
                        'detail_url': detail_url}
                save(info)  # 保存到Mongo数据库中 (store into MongoDB)
    except Exception:
        print('出错了')
# with open('苏宁详情链接.txt', 'a') as f:
# f.write(detail_url)
#
# with open('苏宁详情链接.txt', 'r') as fp:
# for line in fp:
# line = line.strip('\n')
# print(line)
if __name__ == '__main__':
    # Interactive entry point: crawl listing pages for a user-chosen keyword
    # and report the elapsed wall-clock time.
    keyword = input('请输入需要爬取的商品名:')
    t0 = time.time()
    last_page = int(input('请输入要爬取的结束页(1-600):'))
    spider(keyword, last_page)
    print('商品爬取完毕')
    t1 = time.time()
    print(t1 - t0)
# ============ Detail page (详情页) ============
import csv
import time
from selenium import webdriver
import pymongo
# Default detail-URL string consumed by the __main__ block below.
# NOTE(review): left empty, so ky.split(';') yields [''] and the single
# browser.get('') call fails and is skipped — presumably meant to be
# filled with ';'-separated detail URLs; confirm intended usage.
ky = ""
def _first_or_blank(getter):
    """Run *getter* and return its result, or ' ' if the lookup fails.

    Reproduces the original per-field try/except fallback to a blank value.
    """
    try:
        return getter()
    except Exception:
        return ' '


def get_detail(ky):
    """Scrape Suning product-detail pages and store each one via save().

    ky: string of detail-page URLs separated by ';'.
    For each URL a fresh headless Firefox is launched, the detail fields
    (five gallery images, title, shop name, shipping address, price,
    review count, combo options) are extracted, and one document is
    saved; any field that cannot be located falls back to ' '.
    """
    for url in ky.split(';'):
        options = webdriver.FirefoxOptions()
        options.add_argument('--disable-gpu')      # work around a known GPU bug
        options.add_argument('--hide-scrollbars')  # some pages misbehave with scrollbars
        options.add_argument('blink-settings=imagesEnabled=false')  # skip images: faster
        options.add_argument('--headless')         # required on display-less hosts
        browser = webdriver.Firefox(options=options)
        try:
            try:
                browser.get(url)
            except Exception:
                # Bad/empty URL: skip it. The finally below still quits the
                # browser (the original leaked the driver process here).
                continue
            # Dismiss the login dialog when it appears.
            try:
                browser.find_element_by_id('sufei-dialog-close').click()
            except Exception:
                pass
            # The five gallery thumbnails share one xpath template
            # (li[1]..li[5]); the original repeated this block five times.
            img1, img2, img3, img4, img5 = [
                _first_or_blank(lambda i=i: browser.find_element_by_xpath(
                    f'//*[@id="imgZoom"]/div[3]/div/ul/li[{i}]/a/img'
                ).get_attribute('src'))
                for i in range(1, 6)
            ]
            # 商品标题 (product title) — split on newlines as before.
            title = _first_or_blank(
                lambda: browser.find_element_by_xpath(
                    '//*[@id="itemDisplayName"]').text.split('\n'))
            # 店铺名称 (shop name)
            set_meal = _first_or_blank(
                lambda: browser.find_element_by_xpath(
                    '/html/body/div[18]/div[1]/div[1]/div[2]/dl[1]/dd/a').text)
            # 发货地点 (shipping location)
            address = _first_or_blank(
                lambda: browser.find_element_by_xpath('//*[@id="shopName"]/span').text)
            # 产品价格 (price)
            price = _first_or_blank(
                lambda: browser.find_element_by_class_name("mainprice").text)
            # 评价人数 (review count)
            evaluation = _first_or_blank(
                lambda: browser.find_element_by_xpath('//*[@id="productCommTitle"]').text)
            # 属性套餐 (combo options) — drop the trailing entry, as before.
            combo = _first_or_blank(
                lambda: browser.find_element_by_class_name('tzm').text.split('\n')[:-1])
            try:
                info = {'detail_url': url, 'img1': img1, 'img2': img2,
                        'img3': img3, 'img4': img4, 'img5': img5,
                        'title': title, 'set_meal': set_meal,
                        'address': address, 'price': price,
                        'evaluation': evaluation, 'combo': combo}
                save(info)
            except Exception:
                # Best-effort persistence, matching the original behavior.
                pass
        finally:
            # quit() terminates the driver and all windows; the original's
            # close()+quit() pair was redundant and, worse, was skipped
            # entirely when browser.get() failed.
            browser.quit()
#苏宁信息存储
# def save(list):
# csvFile = open('苏宁商品详情信息.csv', 'a', newline='', encoding='utf-8-sig') # 设置newline,否则两行之间会空一行
# writer = csv.writer(csvFile)
# writer.writerow(list)
# csvFile.close()
#配置mongodb
def save(result):
    """Insert one detail-page record into MongoDB (db 'spider', collection 'snxq').

    result: dict of detail fields built by get_detail().
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        dbname = client['spider']
        MONGO_TABLE = dbname['snxq']
        # Collection.insert() was deprecated and removed in PyMongo 4;
        # insert_one() is the supported single-document replacement.
        MONGO_TABLE.insert_one(result)
    finally:
        # Close the per-call connection so sockets are not leaked.
        client.close()
if __name__ == '__main__':
    # Entry point: scrape the detail URLs held in the module-level ky string.
    get_detail(ky)