Scraping JD.com Data

Copyright notice: This is the author's original article; please cite the source when reposting. https://blog.csdn.net/k_runtu/article/details/83592015
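
The script below drives PhantomJS through Selenium to search JD.com for air purifiers (空气净化器), walk through the paginated result list, and append each product's rank, price, title, comment count, and positive-review rate to data.txt as JSON.
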
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
# from config import *
import pymongo
import json
import time
from multiprocessing import Pool, Process
from threading import Thread

# MONGO_URL = 'localhost'
# MONGO_DB = 'taobao'
# MONGO_TABLE = 'product'

# PhantomJS flags: skip image loading, enable the disk cache, and relax SSL checks.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true', '--ignore-ssl-errors=true', '--ssl-protocol=TLSv1']

# KEYWORD = '美食'

# client = pymongo.MongoClient(MONGO_URL)
# db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser.implicitly_wait(15)
# browser = webdriver.PhantomJS()
wait = WebDriverWait(browser, 15)

browser.set_window_size(1400, 3000)

x = 1  # global rank counter, shared by search() and next_page()

class Rate:
    def __init__(self):
        # A second PhantomJS instance fetches each product's review page
        # without disturbing the result-list browser; reuse the module-level
        # SERVICE_ARGS instead of redefining them here.
        self.browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
        self.wait = WebDriverWait(self.browser, 15)
    def get_good_rate(self, url):
        # Heuristic: genuine product pages have short URLs; longer ones are
        # usually promotion links with no review section.
        if len(url) > 52:
            return -1
        self.browser.get(url)
        selector = '#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div'
        doc = pq(self.browser.page_source)
        if doc(selector):
            return doc(selector).text()
        # The review block is lazy-loaded: scroll down to trigger it, then
        # wait for the positive-rate element to appear.
        self.browser.execute_script("window.scrollBy(0,6000)")
        time.sleep(2)
        rate = self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
        return rate.text

r = Rate()  # shared helper for per-product review rates


def search():
    global x
    global r
    print('Searching...')
    try:
        browser.get('https://www.jd.com')
        input_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
        )
        print('search box located')
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#search > div > div.form > button')))
        # submit = wait.until(
        #     EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.s')))
        input_box.send_keys('空气净化器')  # search keyword: "air purifier"
        submit.click()
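        # JD renders the first 30 results immediately and lazy-loads the rest
        # on scroll; waiting for the 30th <li> confirms the batch has loaded.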
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        print('..')
        # Grab the total page count from the bottom pager so main() can
        # iterate over every page (selector assumed from JD's pager markup).
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b'))
        ).text
        doc = pq(browser.page_source)
        # with open('page.txt', 'w', encoding='utf-8') as f:
        #     f.write(doc.text())
        items = doc('.gl-item')
        print(len(items))
        data = []
        for item in items:
            item = pq(item)
            print(x)
            product = {
                'rank': x,
                'price': item('.p-price i').text(),
                'title': item('.p-name em').text(),
                'comment_cnt': item('.p-commit>strong a').text(),
                'comment_url': 'https:' + item('.p-commit>strong a').attr.href
            }
            product['brand'] = product['title'].split('\n')[0]  # first line of the title text is treated as the brand
            good_rate = r.get_good_rate(product['comment_url'])
            product['good_rate'] = good_rate
            data.append(product)
            x += 1
        with open('data.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))
        return total

        # for i in range(2,101):
        #     next_page(i)
    except TimeoutException:
        return False  # main() catches the resulting parse error


def next_page(page_number):
    global x
    global r
    print('Turning to page', page_number)
    try:
        input_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
        input_box.clear()
        input_box.send_keys(page_number)
        submit.click()
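        # As in search(), wait for the 30th item to confirm the page loaded.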
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(30)'))
        )
        print('..')
        # browser.execute_script("window.scrollBy(0,10000)")
        # time.sleep(2)
        # wait.until(
        #     EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul > li:nth-child(60)'))
        # )
        doc = pq(browser.page_source)
        items = doc('.gl-item')
        print(len(items))
        data = []
        for item in items:
            item = pq(item)
            print(x)
            product = {
                'rank': x,
                'price': item('.p-price i').text(),
                'title': item('.p-name em').text(),
                'comment_cnt': item('.p-commit>strong a').text(),
                'comment_url': 'https:' + item('.p-commit>strong a').attr.href
            }
            product['brand'] = product['title'].split('\n')[0]
            good_rate = r.get_good_rate(product['comment_url'])
            product['good_rate'] = good_rate
            data.append(product)
            x += 1
        with open('data.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, indent=2, ensure_ascii=False))
    except Exception as e:
        print(e)
        # Naive retry: recurse until the page loads. Note there is no depth
        # limit, so a persistently failing page would recurse forever.
        next_page(page_number)


# def save_to_mongo(result):
#     try:
#         if db[MONGO_TABLE].insert(result):
#             print('存储到MONGODB成功', result)
#     except Exception:
#         print('存储到MONGODB失败', result)


def main():
    try:
        total = search()  # e.g. '100' from the bottom pager
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('Something went wrong')
        print(e)
    finally:
        browser.quit()    # quit(), not close(): also terminates the PhantomJS process
        r.browser.quit()  # shut down the review-page browser too

if __name__ == '__main__':
    # main()
    search()
    for i in range(2, 5):
        # time.sleep(1)
        # join() immediately after start() runs the pages sequentially on
        # purpose: every call shares the single global browser, so true
        # parallelism would need one browser (and one Rate) per worker.
        t = Thread(target=next_page, args=(i,))
        t.start()
        t.join()
        # next_page(i)
        # p = Process(target=next_page, args=(i,))
        # p.start()
        # p.join()
    # pool = Pool()
    # pool.map(next_page, [i for i in range(2,101)])
    # pool.close()
    # pool.join()
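
A side note: PhantomJS has been unmaintained since 2018, and newer Selenium releases no longer ship a PhantomJS driver. If the script stops working, a rough drop-in sketch using headless Chrome instead (assuming chromedriver is installed and on PATH) would look like this:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--window-size=1400,3000')
# Mirror --load-images=false by disabling image loading in Chrome.
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
browser = webdriver.Chrome(options=options)  # replaces webdriver.PhantomJS(...)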
