#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/12/30 10:51
# @Site :
# @File : jd_iphone.py
# @Software: PyCharm
"""Crawl iPhone listings from JD.com search pages and scrape per-product details.

Flow: ``get_page_detail()`` renders search-result pages with headless Chrome
(the price/shop nodes are filled in by page JavaScript, so a plain HTTP GET is
not enough), keeps only items whose highlighted keywords mention Apple/iPhone,
and returns ``[href, price, shop, tag]`` records.  ``product_detail()`` then
fetches each product page with ``requests`` and extracts title/colour/RAM/
purchase-style fields.  Run as a script to crawl one page and print the result
as JSON.
"""
import json
import logging
import time

import requests
import urllib3
from pyquery import PyQuery
from selenium import webdriver

# Silence InsecureRequestWarning: requests.get(..., verify=False) is used so
# the crawl still works behind an intercepting proxy such as Fiddler.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    "Referer": "https://search.jd.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

# Console logging, one named logger per crawl stage.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger_page = logging.getLogger("jd_iphone_page")
logger_detail = logging.getLogger("jd_iphone_detail")


def get_page_detail(maxp):
    """Scrape the first ``maxp`` JD search-result pages for iPhone listings.

    Args:
        maxp: number of result pages to crawl.

    Returns:
        A list of ``[href, price, shop, tag]`` lists, one per matching product.
        ``href`` is stored scheme-less (leading ``http:`` stripped).
    """
    product_list = []
    p_no = 0  # running count of matched products across all pages
    # JD numbers "real" result pages with odd values: page 1 -> 1, page 2 -> 3, ...
    for page in range(1, maxp * 2, 2):
        url = ('https://search.jd.com/Search?keyword=iphoneapple&page='
               + str(page) + '&click=0')
        # Headless Chrome: price/shop nodes are populated by JS after load,
        # so requests.get() on the search page would return empty fields.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        try:
            driver.get(url)
            driver.implicitly_wait(10)
            # Scroll towards ~90% of the page height so lazily loaded items render.
            js = '''
                timer = setInterval(function(){
                    var scrollTop = document.documentElement.scrollTop || document.body.scrollTop;
                    var ispeed = Math.floor(document.body.scrollHeight / 100);
                    if (scrollTop > document.body.scrollHeight * 90 / 100) {
                        clearInterval(timer);
                    }
                    window.scrollTo(0, scrollTop + ispeed);
                }, 20);
            '''
            driver.execute_script(js)
            time.sleep(5)  # give the interval scroller time to finish
            html = driver.page_source
        finally:
            # BUGFIX: the original never quit the driver and leaked one Chrome
            # process per crawled page.
            driver.quit()
        # parser="html" is required: pyquery parses string input as XML by
        # default, which fails on this non-XHTML markup.
        doc = PyQuery(html, parser="html")
        # BUGFIX: integer division — (page + 1) / 2 printed floats on Python 3.
        logger_page.info("Getting data of page %s ......", (page + 1) // 2)
        for item in doc("#J_goodsList li").items():
            tmp_list = []
            key_list = []  # highlighted keywords harvested from <font> tags
            data_sku = item.attr('data-sku')
            price = item.find(".J_%s" % data_sku).text()
            shop = item.find('.J_im_icon').text()
            tag = item.find("#J_pro_%s" % data_sku).text()
            link = item.find(".gl-i-wrap div a")
            for font in link.find('font').items():
                key_list.append(font.text())
            href = link.attr('href')
            # Keep only items whose keywords mention Apple/iPhone.
            # NOTE(review): the first literal was destroyed by machine
            # translation of the original post; restored as u'苹果' ("apple"
            # in Chinese) — confirm against the live page markup.
            if (u'苹果' in key_list) or (u'iPhone' in key_list) \
                    or (u'Apple' in key_list) or (u'apple' in key_list):
                if "http:" in href:
                    href = href[5:]  # strip the scheme, keep //host/path
                tmp_list.append(href)
                tmp_list.append(price)
                tmp_list.append(shop)
                tmp_list.append(tag)
                product_list.append(tmp_list)
                p_no += 1
                logger_page.info('Getting page %s, product %s ......',
                                 (page + 1) // 2, p_no)
    return product_list


def product_detail(product_list):
    """Fetch each product page and build a dict of detail fields.

    Args:
        product_list: records produced by :func:`get_page_detail`
            (``[href, price, shop, tag]`` each).

    Returns:
        A list of dicts with keys ``title``, ``jd_price``, ``shop``, ``tag``,
        ``colour``, ``ram`` and ``style_buy``.
    """
    # FIX: the original named this parameter/variable ``list``, shadowing the
    # builtin; calls are positional, so the rename is caller-compatible.
    product_info = []
    for no, record in enumerate(product_list, start=1):
        url = 'http:' + record[0]  # hrefs were stored scheme-less
        logger_detail.info("Getting piece of information %s ......", no)
        # FIX: send the module-level headers (they were defined but unused).
        # verify=False is deliberate: keeps the crawl working behind an
        # intercepting proxy (e.g. Fiddler) at the cost of TLS validation.
        detail_html = requests.get(url, headers=headers, verify=False)
        doc = PyQuery(detail_html.text, parser="html")
        product_dic = {
            "title": doc(".itemInfo-wrap div.sku-name").text(),
            "jd_price": record[1],
            "shop": record[2],
            "tag": record[3],
            "colour": doc("#choose-attr-1 div.item").text(),
            "ram": doc("#choose-attr-2 div.item").text(),
            "style_buy": doc("#choose-attr-3 div.item").text(),
        }
        product_info.append(product_dic)
    return product_info


if __name__ == '__main__':
    listings = get_page_detail(1)
    result = product_detail(listings)
    # BUGFIX: json.dumps() has no ``encoding`` kwarg on Python 3 (the original
    # Python-2 call raises TypeError); ensure_ascii=False keeps Chinese text
    # readable in the output.
    print(json.dumps(result, ensure_ascii=False))
A practice project: crawling iPhone product information from an e-commerce site (JD.com)
Guess you like
Origin www.cnblogs.com/East-fence/p/12129371.html
Recommended
Ranking