python爬虫之selenium模拟浏览器爬取淘宝美食

主要代码如下:

# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 20:17:34 2018
利用selenium+正则表达式 模拟chrome浏览器爬淘宝美食信息
重点学习selenium库的用法和思想

利用PhantomJS 构造无界面浏览器, 使浏览器后台运行
Mongodb数据库存储数据, 包括用到pymongo库
@author: Administrator
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import re
from pyquery import PyQuery as pq

from config import *  #配置文件 配置相关 关键字信息,如搜索关键字、数据库名称,当然也可直接加到 主代码中
print(MONGO_DB)
import pymongo
from pymongo import MongoClient

# Open the MongoDB connection and select the database named in config.py.
client = MongoClient(MONGO_URL)
db = client[MONGO_DB]

#from selenium.webdriver.common.keys import Keys


# One module-level Chrome session, used by the functions defined below.
browser = webdriver.Chrome()
#browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)  # headless alternative
#browser.set_window_size(1400,900)

# Explicit-wait helper bound to that browser, with a 10-second timeout.
wait = WebDriverWait(browser, 10)
def search(Keyword):
    """Open the Taobao home page, search for ``Keyword`` and scrape page one.

    Args:
        Keyword: Text typed into the home-page search box.

    Returns:
        The text of the pager's ``div.total`` element on the result page.

    If any wait raises ``TimeoutException`` (including the one inside
    ``get_products``), the whole search is restarted with the same keyword.
    """
    print('正在搜索')
    try:
        browser.get("http://www.taobao.com")
        # Search box on the home page.
        inputinfo = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        # Submit button of the search form.
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        inputinfo.send_keys(Keyword)
        submit.click()
        # The pager total only appears once the result page has loaded.
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        # search() requires the keyword, so it must be passed along when
        # retrying; calling it with no argument would raise TypeError.
        return search(Keyword)
    
def next_page(page_number):
    """Jump to result page ``page_number`` via the pager form and scrape it.

    Args:
        page_number: Page index typed into the pager's input box.

    The function waits until the pager's active item shows ``page_number``
    before calling ``get_products``. On ``TimeoutException`` it calls itself
    again with the same page number (there is no retry limit).
    """
    print('正在翻页')
    box_css = '#mainsrp-pager > div > div > div > div.form > input'
    button_css = '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'
    active_css = '#mainsrp-pager > div > div > div > ul > li.item.active > span'
    try:
        # Page-number input box of the pager form.
        page_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, box_css)))
        # Confirm button next to the input box.
        go_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, button_css)))
        page_box.clear()
        page_box.send_keys(page_number)
        go_button.click()
        # The highlighted pager item must show the requested page number.
        page_is_active = EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, active_css), str(page_number))
        wait.until(page_is_active)
        get_products()
    except TimeoutException:
        return next_page(page_number)
    
def get_products():
    """Parse every product on the current result page, then store and print it.

    Waits for the item elements to be present, parses the page source with
    pyquery, and for each item builds a dict with the keys ``image``,
    ``price``, ``deal``, ``title``, ``shop`` and ``location``. Each dict is
    passed to ``save_to_mongo`` and then printed.
    """
    item_css = '#mainsrp-itemlist .items .item'
    wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, item_css)))
    page = pq(browser.page_source)  # parse the rendered HTML with pyquery
    for node in page(item_css).items():
        record = {
            'image': node.find('.pic .img').attr('src'),
            # Drops the first two characters of the price text
            # (presumably a currency prefix -- TODO confirm).
            'price': node.find('.price').text()[2:],
            # Drops the last three characters of the deal-count text
            # (presumably a fixed suffix -- TODO confirm).
            'deal': node.find('.deal-cnt').text()[:-3],
            'title': node.find('.title').text(),
            'shop': node.find('.shop').text(),
            'location': node.find('.location').text(),
        }
        save_to_mongo(record)
        print(record)

def save_to_mongo(result):
    """Insert one scraped product document into ``db[MONGO_TABLE]``.

    Args:
        result: Dict describing one product. PyMongo adds an ``_id`` key to
            it in place when the insert succeeds.

    Storage is best-effort: a failure is printed together with the exception
    that caused it and is not re-raised, so the crawl continues.
    """
    try:
        # insert_one() replaces Collection.insert(), which was deprecated in
        # PyMongo 3.0 and removed in PyMongo 4.0.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Include the exception so the cause of a failed save is visible.
        print('存储到MONGODB失败', result, exc)
    else:
        print('存储到MONGODB成功', result)
         
def main():
    """Run the crawl: search, read the page count, then visit later pages.

    ``search`` scrapes page 1 and returns the pager text; the first number in
    that text is taken as the total page count, and pages ``2..total`` are
    visited with ``next_page``. The browser session is shut down whether or
    not the crawl succeeds.

    Raises:
        ValueError: If the pager text contains no number.
    """
    try:
        total_text = search(Keyword)
        # Raw string: '\d' in a normal string literal is an invalid escape.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: %r' % (total_text,))
        total = int(match.group(1))
        print(total)
        # Page 1 was handled by search(); continue from page 2 to the last.
        for page in range(2, total + 1):
            next_page(page)
    finally:
        # quit() ends the whole WebDriver session (all windows and the
        # driver process); close() would only close the current window.
        browser.quit()


if __name__ == '__main__':
    main()
    

config.py配置文件 代码如下:

# -*- coding: utf-8 -*-
"""
Created on Thu Jul 19 10:27:20 2018
config.py -- settings pulled into the crawler script via ``from config import *``
@author: Administrator
"""

# Database settings
#MONGO_URL = 'localhost'
#MONGO_URL = 'mongodb://mongodb0.example.net:27019'

MONGO_URL = ['localhost:27017']  # see the MongoDB API documentation
MONGO_DB = 'taobao'       # database name
MONGO_TABLE = 'product'   # collection name

# Search keyword passed to search()
Keyword = '美食'

# Headless-browser settings (referenced by the commented-out PhantomJS line)
SERVICE_ARGS=['--load-images=false','--disk-cache=true']

猜你喜欢

转载自blog.csdn.net/haoha210/article/details/81179415