爬虫02-淘宝数据采集

"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/7/24'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
              ┏┓      ┏┓
            ┏┛┻━━━┛┻┓
            ┃      ☃      ┃
            ┃  ┳┛  ┗┳  ┃
            ┃      ┻      ┃
            ┗━┓      ┏━┛
                ┃      ┗━━━┓
                ┃  神兽保佑    ┣┓
                ┃　永无BUG！   ┏┛
                ┗┓┓┏━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import re
from lxml import etree
import json


# Script: search Taobao for a keyword with a Selenium-driven Chrome browser,
# pull the JSON assigned to `g_page_config` out of the results page, save it
# to ./data/iphonex.json and print a summary of every listed item.
import os  # local to this script section: needed to create the output directory

# Create the WebDriver object (starts a Chrome browser session).
browser = webdriver.Chrome()
# Explicit-wait helper bound to the browser with a 10 second timeout.
wait = WebDriverWait(browser, 10)
try:
    browser.get('https://www.taobao.com/')  # open the Taobao home page
    tb_input = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
    )  # wait until the search input box is present
    search_btn = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
    )  # wait until the search button is clickable
    tb_input.send_keys('iphone X')  # type the search keyword
    search_btn.click()  # submit the search

    # Defensive: do not read the page source before the results page carries
    # its embedded config. A timeout here is reported by the handler below.
    wait.until(lambda driver: 'g_page_config' in driver.page_source)
    html = browser.page_source

    pat = re.compile(r'g_page_config = (.*?}});')
    matchObj = pat.search(html)
    if matchObj is None:
        # Previously this path fell through and crashed with NameError on `conf`.
        print("g_page_config not found in the page source; nothing to extract")
    else:
        conf = json.loads(matchObj.group(1))
        print(conf)
        print(type(conf))

        out_path = "./data/iphonex.json"
        # Make sure the target directory exists so open() cannot fail on it.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, "w", encoding='utf-8') as f:
            # indent=4 pretty-prints the dict; ensure_ascii=False keeps the
            # Chinese text readable instead of \uXXXX escapes.
            json.dump(conf, f, indent=4, ensure_ascii=False)

        itemlist = conf["mods"]["itemlist"]["data"]["auctions"]
        for item in itemlist:
            print("店铺：", item['nick'])
            print("item_loc：", item['item_loc'])
            print("pic_url：", item['pic_url'])
            print("评论数：", item['comment_count'])
            print("详情页面：", item['detail_url'])
            print("标题：", item['raw_title'])
            print("标题：", item['title'])
            print("价格：", item['view_price'])
            print('='*80)
except TimeoutException as e:
    # One of the explicit waits above expired.
    print(e)
finally:
    # Always end the browser session, even when an unexpected error occurs.
    browser.quit()
爬虫02-淘宝数据采集

猜你喜欢