"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/7/24'
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━┓
┃ 神兽保佑 ┣┓
┃ 永无BUG! ┏┛
┗┓┓┏━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
import json
import os
import re

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Script: search Taobao for a keyword with a Selenium-driven Chrome browser,
# extract the ``g_page_config`` JSON blob embedded in the result page, save it
# to ./data/iphonex.json and print selected fields of every listing.

# Create the WebDriver object (starts a Chrome session).
browser = webdriver.Chrome()
# Explicit-wait helper: each wait.until(...) call polls for up to 10 seconds
# and raises TimeoutException if the condition never becomes true.
wait = WebDriverWait(browser, 10)
try:
    browser.get('https://www.taobao.com/')  # open the Taobao home page
    # Wait until the search input box is present in the DOM.
    tb_input = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
    )
    # Wait until the search button can be clicked.
    search_btn = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
        )
    )
    tb_input.send_keys('iphone X')  # type the search keyword
    search_btn.click()  # submit the search

    # click() returns before the result page has loaded, so reading
    # page_source straight away can still yield the previous page. Block
    # until the embedded config marker actually appears.
    wait.until(lambda driver: 'g_page_config' in driver.page_source)
    html = browser.page_source

    # The result page embeds its data as a JavaScript assignment:
    #   g_page_config = {...}};
    # Capture the JSON object literal on the right-hand side.
    pat = re.compile(r'g_page_config = (.*?}});')
    matchObj = pat.search(html)
    if matchObj is None:
        # Without this branch the code below would fail on an undefined
        # ``conf``; report the situation instead.
        print('g_page_config not found in page source')
    else:
        conf = json.loads(matchObj.group(1))
        print(conf)
        print(type(conf))

        # The output directory may not exist on a fresh checkout.
        os.makedirs("./data", exist_ok=True)
        with open("./data/iphonex.json", "w", encoding='utf-8') as f:
            # indent=4 pretty-prints the dictionary in the saved file.
            json.dump(conf, f, indent=4)

        # Each entry of "auctions" describes one listing on the result page.
        itemlist = conf["mods"]["itemlist"]["data"]["auctions"]
        for item in itemlist:
            print("店铺:", item['nick'])
            print("item_loc:", item['item_loc'])
            print("pic_url:", item['pic_url'])
            print("评论数:", item['comment_count'])
            print("详情页面:", item['detail_url'])
            print("标题:", item['raw_title'])
            print("标题:", item['title'])
            print("价格:", item['view_price'])
            print('='*80)
except TimeoutException as e:
    # One of the explicit waits expired (element or config never appeared).
    print(e)
finally:
    # Always end the driver session, even when an unexpected error occurs,
    # so no Chrome/chromedriver process is left behind.
    browser.quit()
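# Source article: "爬虫02-淘宝数据采集" (Crawler 02 - Taobao data collection)
# Web-page residue: "猜你喜欢" (You may also like)
# Reposted from blog.csdn.net/qwerLoL123456/article/details/83143102
# Web-page residue: "今日推荐" (Today's recommendations)
# Web-page residue: "周排行" (Weekly ranking)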