# Practice project: crawl JD.com (e-commerce) iPhone product information

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/12/30 10:51
# @Site    : 
# @File    : jd_iphone.py
# @Software: PyCharm

import json
import time
import urllib3
import logging
import requests
from pyquery import PyQuery
from selenium import webdriver

# Suppress the InsecureRequestWarning raised for verify=False requests
# (e.g. when proxying through Fiddler for packet capture).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Request headers used for the JD search/detail pages.
headers = {
    "Referer": "https://search.jd.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

# Console logging configuration: one logger per scraping stage.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger_page = logging.getLogger("jd_iphone_page")
logger_detail = logging.getLogger("jd_iphone_detail")

def get_page_detail(maxp):
    """Scrape up to *maxp* JD search-result pages for iPhone listings.

    For each listing that looks like a genuine Apple iPhone, collects
    [href, price, shop, tag] into a list.

    :param maxp: number of result pages to scrape.
    :return: list of [href, price, shop, tag] lists, one per product.
    """
    product_list = []
    p_no = 0  # running count of accepted products
    # JD's search URL paginates in steps of 2: page=1,3,5,... correspond
    # to UI pages 1,2,3,... hence range(1, maxp * 2, 2).
    for page in range(1, maxp * 2, 2):
        url = 'https://search.jd.com/Search?keyword=iphoneapple&page=' + str(page) + '&click=0'
        # resp = requests.get(url, headers=headers, verify=False)
        # Plain requests only returns the first items; use a headless
        # browser so the lazily-loaded second half of the page renders.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        try:
            driver.get(url)
            driver.implicitly_wait(10)
            # Scroll the page gradually via JS so lazy-loaded items appear.
            js = '''
            timer = setInterval(function(){
               var scrollTop=document.documentElement.scrollTop||document.body.scrollTop;
               var ispeed=Math.floor(document.body.scrollHeight / 100);
               if(scrollTop > document.body.scrollHeight * 90 / 100){
                   clearInterval(timer);
               }
               console.log('scrollTop:'+scrollTop)
               console.log('scrollHeight:'+document.body.scrollHeight)
               window.scrollTo(0, scrollTop + ispeed)
            }, 20)
            '''
            driver.execute_script(js)
            time.sleep(5)  # give the interval scroller time to reach the bottom
            html = driver.page_source
        finally:
            # Fix: the original leaked one Chrome instance per page.
            driver.quit()
        # parser="html" is required: without it PyQuery treats the page as
        # XHTML and fails to parse it.
        doc = PyQuery(html, parser="html")
        logger_page.info("Getting data of page %s ......" % ((page + 1) / 2))
        for item in doc("#J_goodsList li").items():
            tmp_list = []
            key_list = []  # brand keywords used to verify the item is an iPhone
            # Each product <li> carries its SKU id; price and tag nodes are
            # keyed on it.
            data_sku = item.attr('data-sku')
            para = ".J_%s" % data_sku
            price = item.find(para).text()
            # Shop name.
            shop = item.find('.J_im_icon').text()
            # Promotion tag.
            tag_p = "#J_pro_%s" % data_sku
            tag = item.find(tag_p).text()
            # Product link anchor (selector reconstructed from a garbled
            # source — TODO confirm against the live page markup).
            item = item.find(".gl-i-wrap div a")
            for font in item.find('font').items():
                key_list.append(font.text())
            href = item.attr('href')
            # Keep only genuine Apple iPhones.
            if (u'苹果' in key_list) or (u'iPhone' in key_list) or (u'Apple' in key_list) or (u'apple' in key_list):
                # Some hrefs carry an explicit "http:" scheme; strip it so
                # all stored links are scheme-relative ("//item.jd.com/...").
                if href and "http:" in href:
                    href = href[5:]
                tmp_list.append(href)
                tmp_list.append(price)
                tmp_list.append(shop)
                tmp_list.append(tag)
                product_list.append(tmp_list)
                p_no += 1
                logger_page.info('Getting page %s, product %s ......' % (((page + 1) / 2), p_no))
            else:
                continue
    # print(product_list)
    # print(len(product_list))
    return product_list

def product_detail(list):
    """Fetch the detail page of every product collected by get_page_detail.

    NOTE(review): the parameter name shadows the builtin ``list``; kept
    for backward compatibility with existing callers.

    :param list: list of [href, price, shop, tag] entries
                 (scheme-relative hrefs, as produced by get_page_detail).
    :return: list of dicts with title/price/shop/tag/colour/ram/style_buy.
    """
    no = 0  # running count of processed items (used only for logging)
    product_info = []
    for link in list:
        # Stored hrefs are scheme-relative ("//item.jd.com/..."); prepend
        # the scheme to make them fetchable.
        url = 'http:' + link[0]
        logger_detail.info("Getting info of item %s ......" % (no + 1))
        no += 1
        detail_html = requests.get(url, verify=False)
        # parser="html" required — see get_page_detail.
        doc = PyQuery(detail_html.text, parser="html")

        product_dic = {
            "title": doc(".itemInfo-wrap div.sku-name").text(),
            # Price/shop/tag come from the search page entry itself
            # (fix: use the loop variable instead of re-indexing the list).
            "jd_price": link[1],
            "shop": link[2],
            "tag": link[3],
            "colour": doc("#choose-attr-1 div.item").text(),
            "ram": doc("#choose-attr-2 div.item").text(),
            "style_buy": doc("#choose-attr-3 div.item").text(),
            # "increment": doc("#summary-support div span").text()
        }
        product_info.append(product_dic)
    return product_info


if __name__ == '__main__':
    # Scrape one search page, then fetch the detail page of each product.
    products = get_page_detail(1)
    result = product_detail(products)
    # Fix: json.dumps(..., encoding='UTF-8') is Python 2 only and raises
    # TypeError on Python 3; ensure_ascii=False keeps Chinese text readable.
    print(json.dumps(result, ensure_ascii=False))

# You may also like

# Origin: www.cnblogs.com/East-fence/p/12129371.html