【爬虫】京东商品链接

# -*- coding: utf-8 -*-
from __future__ import division
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from threading import Thread
from pyquery import PyQuery as pq
import chardet
import copy
import xlwt
import os
import mPing
import datetime
import xlwt
from xlrd import open_workbook
now_time = time.strftime('%H-%M-%S', time.localtime(time.time()))
print now_time
# print chardet.detect(now_time)
# print chardet.detect(time_now_time)
#xls_name = ("京东爬虫数据.xls").decode("utf-8")
xls_name = ("京东爬虫数据"+str(now_time)+".xls").decode("utf-8")
#print type(xls_name)
#print "京东爬虫数据"+str(now_time)+".xls"
title = ["链接", "名称", "价格", "晒图", "好评", "中评", "差评", "全部评价"]
urllist = ["https://item.jd.com/11936238.html",
           "https://item.jd.com/11841674.html"
           ]
URLSource = "京东URL.txt".decode('utf-8')
if os.path.isfile(URLSource):
    print "发现URL文件,准备开始爬虫".decode('utf-8')
else:
    print "亲!!! 当前目录下的url文件:   \"".decode('utf-8')+URLSource+"\"    不存在,请添加后再运行".decode('utf-8')
    exit(1)


def msleep1():
    """Block the calling thread for one second."""
    pause_seconds = 1
    time.sleep(pause_seconds)


def msleep2():
    print "...2",
    time.sleep(1)
    print "...1",
    time.sleep(1)
    print "...0"


def msleep3():
    print "5",
    time.sleep(1)
    print "...4",
    time.sleep(1)
    print "...3",
    time.sleep(1)
    print "...2",
    time.sleep(1)
    print "...1",
    time.sleep(1)
    print "...0"


def warnningtext():
    """Return the unicode placeholder stored in a cell when a value cannot be scraped."""
    message = "这里无法正确获取数据(偶尔网速问题会影响一两个数据),请手动检查,如果是代码问题请联系开发修改"
    return message.decode("utf-8")


def cannotgetdataprint(text):
    print ("无法获取"+text+" 请手动检查一下然后联系开发人员").decode('utf-8')


def mprint(str):
    #print  "",
    print "#############   " + str.decode('utf-8') + "   #############"


def debugprint(str):
    print  "",   #不换行空输出   "" 后面加 ,
    print "[email protected]@@   " + str.decode('utf-8')


def totwrite(str):
    """Decode a UTF-8 byte string and return the resulting unicode text."""
    decoded = str.decode('utf-8')
    return decoded

# mPing.mNetPing('jd.com')

# chromeOptions = webdriver.ChromeOptions()
# prefs = {"profile.managed_default_content_settings.images":2}
# chromeOptions.add_experimental_option("prefs",prefs)
# driver = webdriver.Chrome(chrome_options=chromeOptions)

# Build the Chrome options used by the single shared browser instance.
option = webdriver.ChromeOptions()
option.add_argument("test-type")  # suppress Chrome's warning bar
# Image preference value 2 keeps images from being displayed.
prefs = {"profile.managed_default_content_settings.images": 2}
option.add_experimental_option("prefs", prefs)

timesurl = 1

# PhantomJS could not load the AJAX-driven parts of the page, so a real
# Chrome instance is used to render the dynamic content.
webdriver_chrome = webdriver.Chrome(chrome_options=option)

def isUrlBefore():
    """Placeholder for a redirect check; not implemented.

    Intended use (per the original note): after opening a URL, detect whether
    the browser was redirected so the link can be skipped and a warning
    written. Currently does nothing and returns None.
    """
    return None

def isString(isstr, data):
    """Return True when ``isstr`` occurs in the UTF-8 encoding of ``data``.

    ``data`` must provide ``.encode`` (e.g. the unicode text of a web element).
    """
    encoded = str(data.encode("utf-8"))
    return isstr in encoded


def openweb(url):
    global  starttime
    global driver_wait
    global isOffsale
    COUNTINUE = False
    SKIP = 1
    TIAOZHUAN = 2
    LOADERROR = 3
    FATALERROR = 4

    mprint("努力加载链接中,请耐心等待")
    try:
        try:#获取源码进行判断
            respone = requests.get(url)
            #正确打开连接
            isOffsale = False #初始化设置为不下柜
            if respone.status_code == 200:#正确加载价格页面包括下柜的页面
                if "商品评价" in str(respone.text.encode("utf-8")):#说明页面正常访问到商品页面  否则可能被跳转了
                    # print respone.text
                    isOffsale = False
                    if "商品已下柜" in str(respone.text.encode("utf-8")):
                        isOffsale = True
                else:
                    return TIAOZHUAN #说明页面不是价格页面  被跳转了?
            else:#无法打开连接
                return LOADERROR#状态码不是200说明访问有问题
        except Exception, e:
            print Exception, e#无法获取源码
            return FATALERROR
    #以下代码应该不会被执行
        webdriver_chrome.get(url)
        # mprint("获取当前地址")
        if "?c" in getcurrenturl():#有了上面的if "商品评价" in判断后这段代码应该不会被执行到
            mprint("地址已经被跳转")
            return SKIP
        driver_wait = WebDriverWait(webdriver_chrome, 10)
        return COUNTINUE
    except Exception:
        mprint("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!请注意,链接有问题 无法打开 程序可能停止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print url
        print getcurrenturl()
        return SKIP
    finally:
        debugprint("打印url")


def get_element_bycssselector(css_selector):
    """Wait (via the global ``driver_wait``) until an element matching
    ``css_selector`` is present in the DOM and return it.

    Whatever ``driver_wait.until`` raises when the wait fails propagates to
    the caller.
    """
    locator = (By.CSS_SELECTOR, css_selector)
    return driver_wait.until(EC.presence_of_element_located(locator))


def get_datanum_bycssselectorlist(css_selector_list, text):
    for css_selector in css_selector_list:
        try:
            # print css_selector
            element = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
            data_num = element.get_attribute('data-num')
            if isString(text, element.text):
                print element.text + ":" + str(data_num)  # mprint ("显示好评")
                return data_num
            else:
                mprint("无法获取")
        except:
            pass
    return warnningtext()


def get_element_byxpathlist(xpath_list, text):
    for xpath in xpath_list:
        try:
            element = driver_wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
            # print element.text
            if isString(text, element.text):
                print element.text
                return element
            else:
                mprint("无法获取xpath如下")
                print xpath
        except:
            mprint(xpath)
            pass
    return None

# def try_element(element):
#     try:
#         element
#     except:
#         pass



def getname():
    debugprint("start find name btn")
    try:
        myname = webdriver_chrome.find_element_by_class_name('sku-name')
        mprint("1名称:")
        print myname.text
        return myname.text
    except Exception:
        pass
    try:
        myname = webdriver_chrome.find_element_by_css_selector('#name > h1')
        mprint("2名称:")#生鲜 书籍
        print myname.text
        return myname.text
    except Exception:
        pass
    try:
        myname = webdriver_chrome.find_element_by_css_selector('#name')
        mprint("3名称:")#生鲜 书籍
        print myname.text
        return myname.text
    except Exception:
        mprint("第 3次 抓取商品名称失败")
        return warnningtext()


def getprice():
    debugprint("start getprice")
    try:
        myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.summary.summary-first > div > div.summary-price.J-summary-price > div.dd > span')))
        mprint("1价格:")
        # print myprice.text
        finalprice = myprice.text.encode ('utf-8').replace ('', '')
        if finalprice == "":
            msleep1()
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep2 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep3 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
        print finalprice
        return finalprice
    except Exception:#估计下架 做下架的抓取
        pass
    try:  # 生鲜 书籍 抓取价格
        myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#jd-price")))  # 生鲜 可用
        # myprice = webdriver_chrome.find_element_by_xpath("/html/body/div[7]/div/div[2]/div[3]/div/div[1]/div[2]/span/span[2]")
        mprint("2价格:")
        # print myprice.text
        finalprice = myprice.text.encode ('utf-8').replace ('', '')
        if finalprice == "":
            msleep1 ()
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep2 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep3 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
        print finalprice
        return finalprice
    except Exception:  # 估计下架 做下架的抓取
        pass
    try:  # 生鲜 书籍 抓取价格
        myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.summary-price.J-summary-price > div > div.dd > span > span")))  # 生鲜 可用
        mprint("3价格:")
        # print myprice.text
        finalprice = myprice.text.encode ('utf-8').replace ('', '')
        if finalprice == "":
            msleep1 ()
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep2 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep3 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
        print finalprice
        return finalprice
    except Exception:  # 估计下架 做下架的抓取
        pass

    try:  # 下架的抓取  前面判断了下架 这里基本上不会执行了
        mprint("4下架:")
        soldout = webdriver_chrome.find_element_by_class_name('itemover-tip')  # 抓下柜 下架 “该商品已下柜,欢迎挑选其他商品!”

        print  soldout.text

        return soldout.text
    except Exception:
        mprint("抓不到价格 也不是下架 请检查")
        return warnningtext()


def scrolldown():
    """Scroll the browser window down by 500 pixels, logging before and after."""
    debugprint("准备开始滚动500")
    scroll_script = "window.scrollBy(0,500)"
    webdriver_chrome.execute_script(scroll_script)
    debugprint("已向下滚动500")


def clickcommentbtn():
    xpath1 = '//*[@id="detail"]/div[1]/ul/li[5]'
    xpath2 = '//*[@id="detail"]/div[1]/ul/li[4]'
    # xpath3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)'
    btn = get_element_byxpathlist([xpath1, xpath2], "商品评价")
    if btn is not None:
        try:
            btn.click()
            # mprint("xpath点击")
        except Exception, e:
            mprint("btn非空 不过点击失败了 一般不会这样的 报错是否是:Element is not clickable at point (697, 299). Other element would receive the click")
            print Exception, e
    else:
        # pass#其他判断  基本上不会到这里
        css_sele1 = '# detail > div.tab-main.large > ul > li:nth-child(4)'
        css_sele2= '#detail > div.tab-main.large > ul > li.current'
        try:
            get_element_bycssselector(css_sele1).click()
            mprint("通过csssele获取到")
            print css_sele1
        except:
            try:
                get_element_bycssselector(css_sele2).click()
                mprint("通过csssele获取到")
                print css_sele1
            except:
                mprint("实在找不到 联系开发 程序可能终止")

    """
    try:#1#detail > div.tab-main.large > ul > li.current > s
        mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[5]')
        mprint("1点击")
        print mysumcommentbtn.text,  # 三个按钮的链接要用其他的(运动户外类)
        # mprint("运动户外类?")
        if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~点击了按钮") #    这句有问题
            return True
        else:
            mprint("找不到按钮 商品评价  继续寻找2")
    except:
        pass

    try:#2

        mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[4]')
        mprint("2点击")
        print mysumcommentbtn.text
        if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~点击了评论总量按钮")
            return True
        else:
            mprint("找到按钮 不是商品评价  继续寻找3")
    except:
        mprint("2点击找不到继续下一步")
        pass

    try:#3
        css_sele = '# detail > div.tab-main.large > ul > li:nth-child(4)'  # 香蕉
       # http: // item.jd.com / 11461683.html
        mysumcommentbtn = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele)))
        mprint("3点击")
        print mysumcommentbtn.text
        if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~点击了评论总量按钮")
            return True
    except:
        mprint("找不到按钮 商品评价  继续寻找4 ")
        pass


    try:#4
        css_sele = '#detail-tab-comm'  # 书籍类比较多
        mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele)))
        mprint("4点击")
        print mysumcommentbtn.text
        if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~点击了评论总量按钮")
            return True
    except:
        mprint("找不到按钮 商品评价  继续寻找5")
        pass
    try:#5
        css_sele = '#detail > div.tab-main.large > ul > li.current'  # 香蕉 书籍
        # http: // item.jd.com / 11461683.html
        mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele)))
        mprint("5点击")
        print mysumcommentbtn.text
        if "商品评价" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~点击了评论总量按钮")
            return True
        else:
            mprint("第五次也找不到 只能手动找了")
            print getcurrenturl()
            return warnningtext()

    except:
        mprint("无法找到商品评价按钮 请联系开发 提供url:")
        print getcurrenturl()
        return warnningtext()
        """


def getshowpicnum():
    """Return the number shown on the "photo reviews" comment tab.

    The lookup is attempted up to three times because the tab can be present
    before its ``data-num`` attribute has a value (the lookup then yields
    None).

    Returns:
        The value from ``get_datanum_bycssselectorlist`` as soon as it is not
        None; ``warnningtext()`` when all three attempts yielded None, so the
        spreadsheet cell carries the same placeholder as other failed fields
        instead of being silently left blank.
    """
    css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2)'
    css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(2)'
    for i in range(3):  # up to three attempts
        pic_num = get_datanum_bycssselectorlist([css_sele1, css_sele2], "晒图")
        if pic_num is not None:
            return pic_num
        mprint("shaitu")
    return warnningtext()


def totalcomment():
    """Return the ``data-num`` of the "all reviews" tab via
    ``get_datanum_bycssselectorlist`` (which yields ``warnningtext()`` when
    no selector matches)."""
    selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.current',
        '#comments-list > div.mt > div > ul > li.ui-switchable-item.trig-item.curr',
    ]
    return get_datanum_bycssselectorlist(selectors, "全部评价")


def getpositivecomment():
    """Return the ``data-num`` of the positive-review tab via
    ``get_datanum_bycssselectorlist`` (which yields ``warnningtext()`` when
    no selector matches)."""
    selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(4)',
        '#comments-list > div.mt > div > ul > li:nth-child(3)',
        '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)',
    ]
    return get_datanum_bycssselectorlist(selectors, "好评(")


def getmoderatecomment():
    """Return the ``data-num`` of the neutral-review tab via
    ``get_datanum_bycssselectorlist`` (which yields ``warnningtext()`` when
    no selector matches)."""
    selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(5)',
        '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)',
    ]
    return get_datanum_bycssselectorlist(selectors, "中评(")


def getnegativecomment():
    """Return the ``data-num`` of the negative-review tab via
    ``get_datanum_bycssselectorlist`` (which yields ``warnningtext()`` when
    no selector matches)."""
    selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(6)',
        '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(5)',
    ]
    return get_datanum_bycssselectorlist(selectors, "差评(")


def getaddcomment():
    """Return the ``data-num`` of the follow-up-review tab via
    ``get_datanum_bycssselectorlist`` (which yields ``warnningtext()`` when
    the selector does not match)."""
    selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.J-addComment',
    ]
    return get_datanum_bycssselectorlist(selectors, "追评(")


def getcurrenturl():
    """Return the URL currently loaded in the shared Chrome driver."""
    current = webdriver_chrome.current_url
    return current


def mwrite(linenum, zlist): #放一个 要保存的 行数 和 数据list
    count = len(zlist) #列表数据的长度

    mprint("准备插入第 "+str(linenum+1)+" 条数据,一共:"+str(count)+"")
    title_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on', num_format_str='#,##0.00')
    if linenum == 0:
        global wb
        global ws
        wb = xlwt.Workbook()
        ws = wb.add_sheet("京东666".decode("utf-8"))
    for i in range(0, count):#列数
        if i == 0:
            mprint("写入如下数据")
        if linenum == 0:#第1条数据待插入  需要先把标题插入0 再把第一条数据插入1
            ws.write(linenum, i, title[i].decode("utf-8"), title_style)#写标题
            ws.write(linenum+1, i, zlist[i])#这个write是一个覆盖操作 如果没write就放空
            print title[i].decode("utf-8"), zlist[i]
            wb.save(xls_name)
            # if i == (count-1):
            #     mprint("完成本条数据写入")
        else:   #  第2+条数据开始插入
            ws = wb.get_sheet(0)
            ws.write(linenum+1, i, zlist[i])
            print title[i].decode ("utf-8"), zlist[i]
            wb.save(xls_name)

    # mprint(""+str(linenum+1)+"条数据写入成功,还剩"+(sumurlcount-linenum)+"条数据待解析")

class MyThread_totalcom(Thread):
    """Worker thread that runs ``totalcomment()`` and keeps its result."""

    def __init__(self):
        Thread.__init__(self)
        # Pre-set so get_result() returns None instead of raising
        # AttributeError when run() has not finished or died with an error.
        self.totalcom = None

    def run(self):
        """Thread body: fetch the total-review count."""
        self.totalcom = totalcomment()

    def get_result(self):
        """Return the value stored by run(), or None if none was stored."""
        return self.totalcom

class MyThread_showpic(Thread):
    """Worker thread that runs ``getshowpicnum()`` and keeps its result."""

    def __init__(self):
        Thread.__init__(self)
        # Pre-set so get_result() returns None instead of raising
        # AttributeError when run() has not finished or died with an error.
        self.showpic = None

    def run(self):
        """Thread body: fetch the photo-review count."""
        self.showpic = getshowpicnum()

    def get_result(self):
        """Return the value stored by run(), or None if none was stored."""
        return self.showpic

def getall(url):
    starttime = datetime.datetime.now()
    RETURN_CODE = openweb(url)
    print RETURN_CODE,'RETURN_CODE'


    if RETURN_CODE:#TRUE: skip and warning
        try:
            if RETURN_CODE == 2:
                mprint("页面被跳转")
                skiplist = [url, "!!页面被跳转".decode("utf-8"), RETURN_CODE, "", "", "", "", ""]
                return skiplist
            else:#1
                mprint("无法访问 检查网络是否故障")
                skiplist = [url, "!!检查是否无法打开网页".decode("utf-8"), RETURN_CODE, "", "", "", "", ""]
                return skiplist
        except:
            mprint("???")
            skiplist = [url, "!!跳过该条链接".decode("utf-8"), "???????????????????".decode("utf-8"), "", "", "", "", ""]
            return skiplist

    else:#FALSE :continue to get the data
        # starttime = datetime.datetime.now ()
        endtime = datetime.datetime.now()
        timed = (endtime - starttime).seconds
        mprint("网页已经被打开,耗时:"+str(timed)+"")
        debugprint('scrolldown1')
        #urlcurrent = getcurrenturl()#写一个 如果链接被跳转到其他页面就跳过的判断  有时间再写吧 urlcurrent可能变成 jd.com
        scrolldown()
        # msleep1()
        #scrolldown()
        # msleep2()
        debugprint('scrolldown2')
        name = getname()
        if isOffsale:  # 下柜
            price = "商品已下柜".decode ("utf-8")
        else:
            price = getprice()
        clickcommentbtn()
        # msleep2()
        #好评度能加载完成就能显示晒图
        try:
            print u"好评度:", get_element_bycssselector("#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div").text
        except:
            mprint("无法获取好评度,说明网络加载缓慢")
        #想写个多线程  不过单独一个的时候正常 如果两个都放进去就会出问题 难道是selenium不能同时find两个element?

        mprint("多线程开始")
        thd1 = MyThread_totalcom()

        # thd2 = MyThread_showpic()
        thd1.start()
        mprint("MyThread_totalcom线程开始")
        # thd2.start()
        # mprint("MyThread_showpic程开始")
        thd1.join()
        # thd2.join()
        totalcom = thd1.get_result()
        # showpic = thd2.get_result()
        mprint("多线程结束")

        # totalcom = totalcomment()#上面用多线程这里就注释掉
        showpic = getshowpicnum()
        #上面多线程 只能跑一个 totalcomment和getshowpicnum一起就出问题 好像不是我多线程代码有问题 是selenium不能同时find多个元素
        positivcom = getpositivecomment()
        modertcom = getmoderatecomment()
        negtivcom = getnegativecomment()
        # addcomment = getaddcomment()

        sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom, totalcom]
        # sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom ,addcomment]
        # print sumlist
        return sumlist # a list

if __name__ == '__main__':
    try:#__main__
        # print type(now_time), type("时间")
        print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        cc = 0
        # URLSource

        total_starttime = datetime.datetime.now()
        f = open(URLSource, "r")
        lines = f.readlines()  # 读取全部内容
        global sumurlcount
        sumurlcount = len(lines)
        print sumurlcount
        mprint("一共 "+str(sumurlcount)+" 条数据要爬虫")
        for jdurl in lines:
        #for i in urllist:
            s = []
            print jdurl
            one_starttime = datetime.datetime.now ()
            goodsinfo_list = getall(jdurl.replace("\n", ""))
            print "test111111111"
            # print goodsinfo_list
            mwrite(cc, goodsinfo_list)
            oneurl_endtime = datetime.datetime.now ()
            oneurl_timed = (oneurl_endtime - one_starttime).seconds
            mprint ("该条数据写入完成耗时:" + str (oneurl_timed) + "秒,还剩"+str(sumurlcount - cc - 1)+"条数据待分析,即将开始下一个链接的抓取!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            cc = cc + 1


        mprint("@@@@@[email protected]@@@@            所有代码正常运行 无报错          @@@@@@@@@@@[email protected]@@@@@@@@@@@@@@@")
        total_endtime = datetime.datetime.now ()
        total_timed = (total_endtime - total_starttime).seconds
        mprint ("整个爬虫一共耗时:" + str (total_timed) + ""+",单条链接平均爬虫耗时:"+str((round(total_timed/sumurlcount,2)))+ "")

    except Exception, e:
        print Exception, e

        mprint("~~~~~~~~中间有 报错了@@@@@@@@@@@@@@@@")
    finally:
        mprint("sleep 10s后关闭浏览器")
        time.sleep(10)
        webdriver_chrome.quit()

 

猜你喜欢

转载自www.cnblogs.com/hanxing/p/8919962.html