# ===== 列表页 (list-page scraper) =====
from selenium import webdriver # 导入webdriver模块
from selenium.webdriver.common.by import By # 导入定位
from selenium.webdriver.support.ui import WebDriverWait # 导入等待
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time # 导入时间模块
import re
# Console menu for the eight 1688 Southeast-Asia product categories.
border = '+' + '-' * 38 + '+'
menu_lines = [
    border,
    '|' + '1688跨境国家馆东南亚商品'.center(28) + '|',
    '|' + ' ' * 38 + '|',
    '|' + '1.女装'.ljust(17) + '2.男装'.ljust(17) + '|',
    '|' + '3.内衣'.ljust(17) + '4.运动户外'.ljust(15) + '|',
    '|' + '5.消费电子'.ljust(15) + '6.鞋包配'.ljust(16) + '|',
    '|' + '7.美妆个护'.ljust(15) + '8.汽摩配'.ljust(16) + '|',
    border,
]
print('\n'.join(menu_lines))
# Module-level settings read later by main().
classify = int(input('请选择分类商品(单选):'))  # category index, 1-8
page = int(input('请输入要下滑的次数:'))  # number of scroll-down steps
def main():
    """Scrape one category of the 1688 Southeast-Asia listing page.

    Reads the module-level globals ``classify`` (menu choice, 1-8) and
    ``page`` (number of scroll steps), clicks through to the chosen
    category, scrolls to lazy-load product cards, and appends every
    product's detail-page URL to 东南亚链接.txt.
    """
    options = webdriver.FirefoxOptions()  # headless Firefox configuration
    options.add_argument('--disable-gpu')  # work around GPU rendering bugs
    options.add_argument('--hide-scrollbars')  # hide scrollbars on special pages
    options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
    options.add_argument('--headless')  # no visible window (needed on headless Linux)
    browser = webdriver.Firefox(options=options)
    try:
        wait = WebDriverWait(browser, 20)  # explicit wait used for all clicks
        browser.get('https://kj.1688.com/58f7310c.html?spm=a262gg.8864560')
        # Click the "Southeast Asia" country tab.
        country = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, 'ul.cate-nav-list:nth-child(3) > li:nth-child(3)')))
        country.click()
        time.sleep(1)  # give the tab switch time to render
        # Click the user-selected category tab and echo its label.
        # find_element(By.CSS_SELECTOR, ...) replaces the
        # find_element_by_css_selector API that Selenium 4 removed.
        category_selector = 'ul.cate-nav-list:nth-child(4) > li:nth-child(%d)' % classify
        category = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, category_selector)))
        category.click()
        li_name = browser.find_element(By.CSS_SELECTOR, category_selector).text
        print(li_name)
        # Scroll down `page` times so the page lazy-loads more products.
        for scroll_top in range(100000, 100000 * page, 100000):
            time.sleep(0.5)  # let the previous chunk render
            browser.execute_script(
                'var q=document.documentElement.scrollTop= %d ' % scroll_top)
            time.sleep(0.5)
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.NervModuleKjSecondCateOffer')))
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        items = soup.select('div.NervModuleKjSecondCateOffer')  # one div per product card
        for count, item in enumerate(items, 1):
            print('正在获取第%d个商品' % count)
            link = item.select('div .titleView > a')[0].get('href')
            # The card link embeds the offer id URL-encoded: .../offer%2F<id>.html
            offer_id = re.findall(r'offer%2F(\d+).html', link)[0]
            url = 'https://detail.1688.com/offer/' + offer_id + '.html'
            # The fields below feed the (currently disabled) CSV export.
            title = item.select('div .flt-right > div > div > div:nth-child(2) > div > div > a > span:nth-child(2)')[0].text
            inventory = item.select('div .bd-left > div > span')[1].text
            sales = item.select('div .bd-right > div > a > span')[1].text
            try:
                store = item.select('div .flt-right > div > div > div:nth-child(4) > div > div > a > span:nth-child(2)')[0].text
            except IndexError:  # some cards carry no store name
                store = ''
            picture = item.select('div .flt-right > div > div > div > div > a > div > img')[0].get('src')
            # URLs are written back-to-back (no separator); the detail-page
            # script recovers them later by splitting on 'https:'.
            with open('东南亚链接.txt', 'a', encoding='utf-8') as f:
                f.write(url)
            # info = [offer_id + '\t', title, inventory, sales, store, url, picture, li_name]
            # save(info)  # CSV export currently disabled
    finally:
        browser.quit()  # quit (not close) so the geckodriver process also exits
# def save(list):
# csvFile = open('东南亚商品列表.csv','a', newline='', encoding='utf-8-sig') # 以a的形式追加文件,同时设置newline不会出现空行
# writer = csv.writer(csvFile)
# writer.writerow(list)
# csvFile.close() # 写入完成,关闭文件
if __name__ == '__main__':
    # Header row for the (currently disabled) CSV export:
    # header = ['商品id','商品标题','商品库存','商品销量','店铺名称','详情链接','展示主图','商品类别']
    # save(header)
    main()
    print('商品获取完毕,谢谢使用!')
# ===== 详情页 (detail-page scraper) =====
from selenium import webdriver # 导入webdriver模块
import time # 导入时间模块
import json # 导入json
import re # 导入正则
import csv # 导入csv表格
import itertools # 导入迭代模块
import gevent #导入协程相关的库
# Selector templates for the five product-image slots and the 2nd-5th
# variant-thumbnail slots on a 1688 detail page.
_THUMB_SELECTOR = ('li.tab-trigger:nth-child(%d) > div:nth-child(1) > '
                   'a:nth-child(1) > img:nth-child(1)')
_VARIANT_ALT_SELECTOR = ('.list-leading > li:nth-child(%d) > div:nth-child(1) > '
                         'a:nth-child(1) > span:nth-child(1) > '
                         'span:nth-child(1) > img:nth-child(1)')


def _text_or_default(browser, selector, default=''):
    """Return the text of the first element matching *selector*, or *default*."""
    try:
        return browser.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        return default


def _collect_images(browser):
    """Return up to five 600x600 product image URLs; index 0 is the main image."""
    images = []
    for slot in range(1, 6):
        try:
            src = browser.find_element(
                By.CSS_SELECTOR, _THUMB_SELECTOR % slot).get_attribute('src')
        except Exception:
            continue  # this thumbnail slot does not exist on the page
        if not src:
            continue
        src = src.replace('60x60', '600x600')  # request the large rendition
        if not src.startswith('https:'):
            src = 'https:' + src
        images.append(src)
    return images


def _collect_variant_alts(browser):
    """Return [''] plus the alt-texts of the selected and 2nd-5th variant thumbs.

    The leading '' placeholder is required by the pairing rule in
    _build_variants().
    """
    selectors = ['a.selected > span:nth-child(1) > span:nth-child(1) > img:nth-child(1)']
    selectors.extend(_VARIANT_ALT_SELECTOR % slot for slot in range(2, 6))
    alts = ['']
    for selector in selectors:
        try:
            alt = browser.find_element(By.CSS_SELECTOR, selector).get_attribute('alt')
        except Exception:
            continue
        if alt:
            alts.append(alt)
    return alts


def _build_variants(var_names, var_alts, size_list):
    """Combine variant names/alt-texts with sizes into at most 20 pairs.

    Reproduces the original pairing rule: of every (name, alt, size) triple
    only those containing the '' placeholder are kept, so the output holds
    two-element [name, size] / [alt, size] (or [name, alt]) combinations.
    """
    combos = []
    for triple in itertools.product(var_names, var_alts, size_list):
        values = list(triple)
        if '' not in values:  # full triples are deliberately skipped
            continue
        deduped = list(set(values))
        deduped.sort(key=values.index)  # set() loses order; restore it
        deduped.remove('')  # drop the placeholder itself
        if len(deduped) > 1:
            combos.append(deduped)
    # De-duplicate whole combinations, keeping first-seen order.
    unique = [list(t) for t in set(tuple(c) for c in combos)]
    unique.sort(key=combos.index)
    return unique[:20]  # cap at 20 combinations


def HandleAll(url_list_index, urls_list, num):
    """Scrape 1688 detail pages and append the results to two CSV files.

    Visits every third URL of *urls_list* starting at *url_list_index*, so
    three greenlets spawned with offsets 1, 2 and 3 share the list without
    overlap.  For each page it extracts title, images, store, price, sales,
    ratings and variant/size combinations, then appends one row via
    product_save() and one via deatils_save().

    Args:
        url_list_index: first index this worker handles.
        urls_list: all candidate detail-page URLs.
        num: total resource count (progress output only).
    """
    options = webdriver.FirefoxOptions()  # headless Firefox configuration
    options.add_argument('--disable-gpu')  # work around GPU rendering bugs
    options.add_argument('--hide-scrollbars')  # hide scrollbars on special pages
    options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
    options.add_argument('--headless')  # no visible window
    browser = webdriver.Firefox(options=options)
    try:
        for index in range(url_list_index, len(urls_list), 3):
            url = urls_list[index]
            if url[0:23] != 'https://detail.1688.com':
                continue  # skip fragments that are not detail-page links
            url_start = time.time()
            try:
                browser.get(url)
            except Exception:
                continue  # unreachable page: move on to the next URL
            gevent.sleep(1)  # yield to sibling greenlets while the page settles
            print('现有资源%d个' % num)
            print('正在获取第%d个资源' % index)
            print()
            print('详情链接', url)
            # 'id' renamed to offer_id so the builtin is not shadowed.
            offer_id = re.findall(r'https://detail.1688.com/offer/(\d+).html', url)[0]
            print('商品 id:', offer_id)
            title = _text_or_default(browser, '#mod-detail-title > h1')
            images = _collect_images(browser)
            # Defaults prevent the NameError the original hit when no image
            # was found (master_map/auxiliary_chart were left unbound).
            master_map = images[0] if images else ''
            auxiliary_chart = images[1:]
            json_str1 = json.dumps(auxiliary_chart)  # secondary images as JSON
            store = _text_or_default(browser, 'a.name:nth-child(1)')
            brand = ''  # brand is not exposed on the page
            address = _text_or_default(browser, '.delivery-addr')
            price = _text_or_default(browser, 'tr.price > td:nth-child(2) > span:nth-child(2)')
            sales = _text_or_default(browser, '.bargain-number > a:nth-child(1) > em:nth-child(1)')
            comment = _text_or_default(browser, '.satisfaction-number > a:nth-child(1) > em:nth-child(1)')
            collect = _text_or_default(browser, '.unit-detail-fav > span:nth-child(2)')
            collect = collect.replace('(', '').replace(')', '')  # strip "(123)" parens
            # Two page layouts place the description score differently;
            # fall back to the second selector instead of crashing.
            describe = _text_or_default(browser, '.topbar-bsr > div:nth-child(1) > div:nth-child(2) > div:nth-child(3) > span:nth-child(1)')
            if describe == '':
                describe = _text_or_default(browser, '.universal-supplier-bsr > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > span:nth-child(1)')
            shop_paraments = _text_or_default(browser, '#mod-detail-attributes')
            json_str3 = json.dumps(shop_paraments.split('\n'))  # parameters as JSON
            try:
                # Expand the collapsed SKU table so all size rows are in the DOM.
                arrow_down = browser.find_element(By.CSS_SELECTOR, 'html body div#doc div.content-wrap div#content.page-offerdetail.page-content-sub div.page-offerdetail-content-wrap div#site_content_fluid-box.segment-box.segment-box-fluid div#site_content_fluid.segment.layout.layout-fluid div.region.region-type-fluid div.mod.mod-offerDetailContext1.app-offerDetailContext1.mod-ui-not-show-title div.m-body div.m-content div#J_DetailInside.detail-inside.detail-full-column.area-detail-property div#mod-detail.mod-detail div.grid-full.grid-full-bd div#mod-detail-bd.mod-detail-bd div.region-custom.region-detail-property.region-takla.ui-sortable.region-vertical div.widget-custom.offerdetail_ditto_purchasing div.widget-custom-container div.mod-detail-purchasing.mod-detail-purchasing-multiple div.d-content div.obj-sku div.obj-expand a')
                arrow_down.click()
            except Exception:
                pass  # no expander on this page
            # find_elements returns [] when nothing matches — no try needed.
            size_elements = browser.find_elements(
                By.CSS_SELECTOR,
                '.table-sku > tbody:nth-child(1) > tr > td:nth-child(1) > span:nth-child(1)')
            size_list = [element.text for element in size_elements]
            var_elements = browser.find_elements(
                By.CSS_SELECTOR,
                '#mod-detail-bd > div.region-custom.region-detail-property.region-takla.ui-sortable.region-vertical > div.widget-custom.offerdetail_ditto_purchasing > div > div > div > div.obj-leading > div.obj-content > ul > li > div > a > span')
            var_names = [''] + [element.text for element in var_elements]
            var_alts = _collect_variant_alts(browser)
            variants = _build_variants(var_names, var_alts, size_list)
            if not variants:
                variants = list(set(size_list))  # fall back to plain sizes
            json_str2 = json.dumps(variants)  # variant combinations as JSON
            url_end = time.time()
            print('资源耗时:', url_end - url_start)
            print()
            try:
                product_save([offer_id + '\t', address, sales])
            except Exception:
                pass  # best-effort: a failed row must not kill the worker
            try:
                deatils_save([offer_id + '\t', title, master_map, store, brand,
                              address, price, sales, comment, collect, describe,
                              url, json_str1, json_str2, json_str3])
            except Exception:
                pass
    finally:
        browser.quit()  # quit (not close) so the geckodriver process also exits
def deatils_save(list2):
    """Append one product-detail row to 1688东南亚商品详情.csv.

    Args:
        list2: the row (list of cell values) to append.
    """
    # 'a' appends; newline='' stops csv from emitting blank lines on
    # Windows; utf-8-sig keeps Excel happy with the Chinese headers.
    # `with` guarantees the file is closed even if the write raises
    # (the original leaked the handle on error).
    with open('1688东南亚商品详情.csv', 'a', newline='', encoding='utf-8-sig') as csv_file:
        csv.writer(csv_file).writerow(list2)
def product_save(list1):
    """Append one summary row to 1688东南亚商品.csv.

    Args:
        list1: the row (list of cell values) to append.
    """
    # 'a' appends; newline='' stops csv from emitting blank lines on
    # Windows; utf-8-sig keeps Excel happy with the Chinese headers.
    # `with` guarantees the file is closed even if the write raises
    # (the original leaked the handle on error).
    with open('1688东南亚商品.csv', 'a', newline='', encoding='utf-8-sig') as csv_file:
        csv.writer(csv_file).writerow(list1)
if __name__ == '__main__':
    print('数据读取中,耐心等待,请勿关闭......')
    start = time.time()
    # Write the header row of each output file once, up front.
    headers = ['商品 ID', '商品标题', '商品主图', '店铺名称', '品牌名称', '发货地点', '产品价格', '当月销量', '评价人数',
               '收藏人气', '评分描述', '详情链接', '商品副图', '变体信息', '商品参数']
    deatils_save(headers)
    header = ['商品 ID', '发货地点', '付款人数']
    product_save(header)
    # The link file is one long run of concatenated URLs (no separators),
    # so split on the scheme to recover the individual links.  The original
    # also looped over the file line-by-line doing nothing and then opened
    # it a second time without `with`; both removed.
    with open('东南亚链接.txt', 'r') as fp:
        strings = fp.read()
    num = strings.count('http')  # every link starts with http → link count
    urls_list = ['https:' + part for part in strings.split('https:')]
    # Three greenlets share the list: worker i handles indices i, i+3, i+6, ...
    gevent.joinall([
        gevent.spawn(HandleAll, 1, urls_list, num),
        gevent.spawn(HandleAll, 2, urls_list, num),
        gevent.spawn(HandleAll, 3, urls_list, num),
    ])
    end = time.time()
    print('程序共耗时:', end - start)