# Search JD.com products by scraping the dynamically generated search pages,
# each product's detail page and its comment-summary endpoint.
import re

import requests
from bs4 import BeautifulSoup


class JD:
    """Scrape JD.com search results and print a summary of each product.

    For every product on every result page the scraper prints the product
    id, name, title, cumulative good-review count, good-review rate,
    canonical link and price.
    """

    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # Products priced above this value are not reported.
    MAX_PRICE = 10000

    # Compiled once instead of on every call.
    _GOOD_RATE_RE = re.compile('goodRateShow":(.*?),"poorRateShow')
    _GOOD_COUNT_RE = re.compile('"goodCountStr":"(.*?)",')
    _PRODUCT_ID_RE = re.compile(r'\d+')

    def __init__(self):
        # Browser-like request header; most sites reject requests that do
        # not carry one, so it is sent with every request.
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}

    def req(self, url):
        """Crawl every result page of a search URL and print each product.

        Args:
            url: Search URL of the first result page, without a ``page``
                query parameter (one is appended for each page).

        Returns:
            None. Results are printed. If the first page cannot be fetched
            the method returns without crawling anything.
        """
        first_page = self.requests_utf(url)
        if first_page is None:
            # The fetch helper has already reported the failure.
            return

        count_tag = first_page.find('span', id='J_resCount')
        if count_tag is not None:
            print('共有%s件商品' % count_tag.get_text())
        print('**************************************************************************')

        page_count = self._page_count(first_page)
        # The original code stepped through odd numbers 1, 3, 5, ... up to
        # page_count * 2; that numbering is kept.  It previously built the
        # page URL with url[:-2] + str(i), which only mangled the trailing
        # "&enc=utf-8" and never selected a page, so the page number is now
        # sent as an explicit "page" parameter.
        for page_no in range(1, page_count * 2, 2):
            page_url = '%s&page=%d' % (url, page_no)
            soup = self.requests_utf(page_url)
            if soup is None:
                continue
            for item in soup.find_all('li', class_='gl-item'):
                self._report_item(item)

    @staticmethod
    def _page_count(soup):
        """Return the page count shown in the ``fp-text`` span, or 1 if absent."""
        tag = soup.find('span', class_='fp-text')
        try:
            return int(tag.i.get_text())
        except (AttributeError, ValueError):
            # Span or its <i> child missing, or its text is not a number.
            return 1

    @staticmethod
    def _absolute_url(href):
        """Return ``href`` unchanged if it starts with "http", else prefix "https:"."""
        if href.startswith('http'):
            return href
        return 'https:' + href

    def _report_item(self, item):
        """Print the details of one ``li.gl-item`` search result.

        Items that lack a link or a numeric price, that cost more than
        ``MAX_PRICE``, or whose detail page cannot be read are skipped.
        """
        link = item.a
        if link is None or not link.get('href'):
            return
        title = link.get('title', '')
        href = self._absolute_url(link['href'])

        try:
            price = float(item.i.get_text())
        except (AttributeError, ValueError):
            # No <i> price tag, or its text is empty / not a number.
            return
        if price > self.MAX_PRICE:
            # Checked before the detail and comment requests so that items
            # which would be discarded anyway cost no extra network calls.
            return

        detail = self.requests_gbk(href)
        if detail is None:
            return
        canonical = detail.find('link', rel="canonical")
        sku_name = detail.find('div', class_='sku-name')
        if canonical is None or not canonical.get('href') or sku_name is None:
            return

        # Use the same prefix rule as for the item link, so an already
        # absolute canonical URL is not corrupted into "https:https://...".
        real_href = self._absolute_url(canonical['href'])
        product_name = str(sku_name.get_text()).strip()

        id_match = self._PRODUCT_ID_RE.search(real_href)
        if id_match is None:
            return
        product_id = id_match.group()

        good_rates, good_counts = self.product_summary(product_id)
        good_rate_show = ''.join(good_rates)
        good_count_str = ''.join(good_counts)

        print('商品ID:', product_id)
        print('商品:', product_name)
        print('商品标题:', title)
        print('好评累计:', good_count_str)
        print('好评率:', good_rate_show + '%')
        print('链接地址:', real_href)
        print('价格:', price)
        print('*************************************************************************')
        # NOTE(review): the original carried commented-out prints of coupon
        # data (`quan`, `quan_item`) that were never defined; they were dropped.

    def product_summary(self, product_ID):
        """Fetch the comment summary of a product.

        Args:
            product_ID: Product id as a string.

        Returns:
            A ``(good_rates, good_counts)`` tuple of lists holding every
            regex match for the good-review rate and the cumulative
            good-review count in the response. Both lists are empty when
            the request fails or nothing matches.
        """
        url = ('https://sclub.jd.com/comment/productPageComments.action'
               '?callback=fetchJSON_comment98vv5292&productId=' + product_ID +
               '&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1')
        # str(None) is 'None', which simply yields no matches on failure.
        body = str(self.requests_gbk(url))
        good_rates = self._GOOD_RATE_RE.findall(body)
        good_counts = self._GOOD_COUNT_RE.findall(body)
        return good_rates, good_counts

    def _fetch_soup(self, url, encoding):
        """Download ``url`` and parse it with lxml, decoding as ``encoding``.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` (after
            printing an error message) when the HTTP request fails.
        """
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            print('网页解析发生错误!!!!')
            return None
        response.encoding = encoding
        return BeautifulSoup(response.text, 'lxml')

    def requests_utf(self, url):
        """Fetch and parse a page decoded as UTF-8; ``None`` on request failure."""
        return self._fetch_soup(url, 'utf-8')

    def requests_gbk(self, url):
        """Fetch and parse a page decoded as GBK; ``None`` on request failure."""
        return self._fetch_soup(url, 'gbk')


jd = JD()
search_word = '女士长款钱包'
url = 'https://search.jd.com/Search?keyword=' + search_word + '&enc=utf-8'

if __name__ == '__main__':
    # Only crawl when run as a script, so importing the module stays free
    # of network traffic.
    jd.req(url)
python爬虫-京东全网搜索
猜你喜欢
转载自blog.csdn.net/zou407479250/article/details/80251609
今日推荐
周排行