【Python】打响2019年第二炮-Python爬虫入门（二）

打响2019第二炮-Python爬虫入门

在2019年第一炮文章中，我们获取到了京东商城电脑商品的列表信息，并保存到CSV文件中以便查看，如下：
在这里插入图片描述
本章主要解决如何多页获取手机和电脑数据，以及如何获取评价数、好评率等信息，实现效果如下：

如何获取评论信息？

在京东页面搜索手机或者电脑，随后按 F12 或者 Ctrl+Shift+I 打开开发者工具。评论的英文是 comment，所以我们可以尝试在开发者工具页面中搜索 comment。
在这里插入图片描述
打开此页面

从搜索 comment 得到的 Response 返回结果来看，数据是以 {} 包裹的 JSON 格式，这时候就可以尝试获取这段 JSON 数据：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import requests
from bs4 import BeautifulSoup


def download(url, headers, num_retries=3, timeout=10):
    """Fetch ``url`` with ``requests`` and return the raw response body.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        num_retries: How many further attempts are allowed after a 5xx answer.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        The response body as ``bytes`` on HTTP 200, ``None`` for any other
        status once the retries are used up, or ``""`` when the request
        itself failed (connection error, timeout, ...).
    """
    # Imported locally: the surrounding snippet only imports ``requests``
    # itself, so the bare name would otherwise be undefined in the handler.
    from requests.exceptions import RequestException

    print("download", url)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException as e:
        print(e.response)
        # e.response is None when no response was received at all.
        code = getattr(e.response, 'status_code', None)
        if code is None:
            return ""
        print('error code', code)
        if num_retries > 0 and 500 <= code < 600:
            return download(url, headers, num_retries - 1, timeout)
        return ""

    code = response.status_code
    print(code)
    if code == 200:
        return response.content
    # requests.get() does not raise for HTTP error statuses, so server-side
    # errors have to be retried here rather than in the except branch.
    if num_retries > 0 and 500 <= code < 600:
        return download(url, headers, num_retries - 1, timeout)
    return None


def get_json():
    """Request the comment summaries for a fixed list of product IDs.

    The raw body returned by ``download`` is printed as-is; nothing is
    parsed or returned.
    """
    request_headers = dict((
        ('User-agent', "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"),
        ("referer", "https://www.jd.com"),
    ))

    # Summary endpoint with the product IDs passed in ``referenceIds``.
    summary_url = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=8674557,100000769466,8443496,100000769432,100000117782,100000679465,100000863175,100000612187,8461498,8461490,8461496,7765111,100001045546,7999189,100000667974,100001045648,6072622,100000644947,100002470752,8484118,7690501,7621213,8596169,100000863245,100001045514,100001269968,100001692089,100000863247,100000400472,100001521818&callback=jQuery9848036&_=1546399791459"

    raw_body = download(summary_url, headers=request_headers)
    print(raw_body)

# Run the demo request only when this file is executed as a script.
if __name__ == "__main__":
    get_json()

以下为输出数据
在这里插入图片描述
获取json数据时的url部分如下：

从获取到的 JSON 来看，每一段都代表某一台电脑的评价及好评率等信息，每一段对应一个商品 ID，如下：

如果需要获取每一件商品的评论数及好评率，可以在以下链接末尾加上该商品的 ID，请求得到 JSON 数据，再从中解析出评论数及好评率，从而获得想要的内容：

在这里插入图片描述

def find_Computer(url, headers):
    """Scrape one JD search result page and save its products to Computer.csv.

    For every ``li.gl-item`` entry the product ID, name and price are read
    from the page, and the comment count and good-review rate are fetched
    through ``get_json``. Each product is printed and written as one CSV row.

    Args:
        url: Search result page to download.
        headers: HTTP headers passed on to ``download``.
    """
    r = download(url, headers=headers)
    # download() returns None or "" on failure; parse an empty document then,
    # so only the header row is written instead of raising.
    page = BeautifulSoup(r or "", "lxml")
    items = page.find_all('li', attrs={'class': 'gl-item'})

    # utf-8-sig keeps the Chinese text encodable regardless of the platform
    # default encoding; the BOM lets spreadsheet tools detect UTF-8.
    with open("Computer.csv", 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(('ID', '名称', '价格', '评论数', '好评率'))

        for item in items:
            Computer_id = item.get("data-sku")
            name_div = item.find('div', attrs={'class': 'p-name p-name-type-2'})
            name_tag = name_div.find('em') if name_div is not None else None
            price_div = item.find('div', attrs={'class': 'p-price'})
            price_tag = price_div.find('i') if price_div is not None else None

            # Entries lacking any of the expected nodes cannot be turned
            # into a row; skip them rather than abort the whole export.
            if not Computer_id or name_tag is None or price_tag is None:
                continue

            Computer_name = name_tag.text
            Computer_price = price_tag.text
            print(f"电脑ID为：{Computer_id}")
            print(f"电脑的名称为：{Computer_name}")
            print(f"电脑的价格为：{Computer_price}元")

            # Per-product summary JSON (comment count, good-review rate, ...).
            Comment = f"https://club.jd.com/comment/productCommentSummaries.action?referenceIds={Computer_id}"
            summary = get_json(Comment)
            # get_json() yields None when the summary list has no entry.
            comment_count, good_rate = summary if summary else ("", "")
            print('评价人数：', comment_count)
            print('好评率：', good_rate)

            writer.writerow([
                Computer_id,
                Computer_name,
                str(Computer_price) + "元",
                comment_count,
                good_rate,
            ])

获取每台电脑商品的json数据

def get_json(url, headers=None, timeout=10):
    """Fetch a comment-summary JSON document and return the first entry's stats.

    Args:
        url: Summary endpoint URL, including the ``referenceIds`` query.
        headers: Optional HTTP headers for the request.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot block the caller's loop indefinitely.

    Returns:
        ``(CommentCountStr, GoodRateShow)`` taken from the first element of
        the ``CommentsCount`` list, or ``None`` when that list is empty.

    Raises:
        KeyError: If the response has no ``CommentsCount`` key.
    """
    data = requests.get(url, headers=headers, timeout=timeout).json()
    summaries = data['CommentsCount']
    if not summaries:
        return None
    first = summaries[0]
    return first["CommentCountStr"], first["GoodRateShow"]

代码如下：

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import requests
import csv
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


def download(url, headers, num_retries=3, timeout=10):
    """Fetch ``url`` with ``requests`` and return the raw response body.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        num_retries: How many further attempts are allowed after a 5xx answer.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        The response body as ``bytes`` on HTTP 200, ``None`` for any other
        status once the retries are used up, or ``""`` when the request
        itself failed (connection error, timeout, ...).
    """
    print("download", url)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException as e:
        print(e.response)
        # e.response is None when no response was received at all.
        code = getattr(e.response, 'status_code', None)
        if code is None:
            return ""
        print('error code', code)
        if num_retries > 0 and 500 <= code < 600:
            return download(url, headers, num_retries - 1, timeout)
        return ""

    code = response.status_code
    print(code)
    if code == 200:
        return response.content
    # requests.get() does not raise for HTTP error statuses, so server-side
    # errors have to be retried here rather than in the except branch.
    if num_retries > 0 and 500 <= code < 600:
        return download(url, headers, num_retries - 1, timeout)
    return None


def find_Computer(url, headers):
    """Scrape one JD search result page and save its products to Computer.csv.

    For every ``li.gl-item`` entry the product ID, name and price are read
    from the page, and the comment count and good-review rate are fetched
    through ``get_json``. Each product is printed and written as one CSV row.

    Args:
        url: Search result page to download.
        headers: HTTP headers passed on to ``download``.
    """
    r = download(url, headers=headers)
    # download() returns None or "" on failure; parse an empty document then,
    # so only the header row is written instead of raising.
    page = BeautifulSoup(r or "", "lxml")
    items = page.find_all('li', attrs={'class': 'gl-item'})

    # utf-8-sig keeps the Chinese text encodable regardless of the platform
    # default encoding; the BOM lets spreadsheet tools detect UTF-8.
    with open("Computer.csv", 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(('ID', '名称', '价格', '评论数', '好评率'))

        for item in items:
            Computer_id = item.get("data-sku")
            name_div = item.find('div', attrs={'class': 'p-name p-name-type-2'})
            name_tag = name_div.find('em') if name_div is not None else None
            price_div = item.find('div', attrs={'class': 'p-price'})
            price_tag = price_div.find('i') if price_div is not None else None

            # Entries lacking any of the expected nodes cannot be turned
            # into a row; skip them rather than abort the whole export.
            if not Computer_id or name_tag is None or price_tag is None:
                continue

            Computer_name = name_tag.text
            Computer_price = price_tag.text
            print(f"电脑ID为：{Computer_id}")
            print(f"电脑的名称为：{Computer_name}")
            print(f"电脑的价格为：{Computer_price}元")

            # Per-product summary JSON (comment count, good-review rate, ...).
            Comment = f"https://club.jd.com/comment/productCommentSummaries.action?referenceIds={Computer_id}"
            summary = get_json(Comment)
            # get_json() yields None when the summary list has no entry.
            comment_count, good_rate = summary if summary else ("", "")
            print('评价人数：', comment_count)
            print('好评率：', good_rate)

            writer.writerow([
                Computer_id,
                Computer_name,
                str(Computer_price) + "元",
                comment_count,
                good_rate,
            ])


def get_json(url, headers=None, timeout=10):
    """Fetch a comment-summary JSON document and return the first entry's stats.

    Args:
        url: Summary endpoint URL, including the ``referenceIds`` query.
        headers: Optional HTTP headers for the request.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot block the caller's loop indefinitely.

    Returns:
        ``(CommentCountStr, GoodRateShow)`` taken from the first element of
        the ``CommentsCount`` list, or ``None`` when that list is empty.

    Raises:
        KeyError: If the response has no ``CommentsCount`` key.
    """
    data = requests.get(url, headers=headers, timeout=timeout).json()
    summaries = data['CommentsCount']
    if not summaries:
        return None
    first = summaries[0]
    return first["CommentCountStr"], first["GoodRateShow"]


def main():
    """Run the scraper against a fixed JD search result page.

    The URL-encoded ``keyword`` in the search URL decodes to "电脑"
    (computer).
    """
    search_url = "https://search.jd.com/Search?keyword=%E7%94%B5%E8%84%91&enc=utf-8&wq=%E7%94%B5%E8%84%91&pvid=1ff18312e8ef48febe71a66631674848"

    request_headers = {}
    request_headers['User-agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    request_headers["referer"] = "https://passport.jd.com"

    find_Computer(search_url, headers=request_headers)

# Start the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()