Scraping some shop data from Dianping (大众点评) with Python and the Scrapy framework~

Sharing my journey of scraping data with Python, starting from zero. I hope it helps others who are also starting from scratch~~ keep at it.


First, my development environment:

Computer: macOS Sierra 10.12.6; editor: PyCharm + Terminal

My machine ships with Python 2.7, so I downloaded Python 3.6 and used it for this write-up, adding the new Python to the environment variables. It normally comes bundled with pip. Open Terminal, cd to the directory where pip lives, and run pip install scrapy

Open Terminal, cd to the directory where you want the project to live, and run scrapy startproject Test


This automatically generates a set of files in that directory; from here on you only need to modify some of them.

Open the project with PyCharm and take a look at the directory structure.



The root directory is named after the project you created, and it contains a spiders folder with an __init__.py inside.

Directly under the root you'll find __init__.py, items.py, middlewares.py, pipelines.py, and settings.py.

cd into the Test directory and run scrapy genspider ShopSpider "dianping.com"

This generates a ShopSpider.py file under the project's spiders directory.
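For reference, the layout at this point looks roughly like this (the outer Test directory also holds scrapy.cfg):

Test/
    scrapy.cfg
    Test/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            ShopSpider.py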


With the files in place, go to the site you want to scrape and look at the tag structure of its page source.

Modify items.py according to the data you want to scrape:

import scrapy


class TestItem(scrapy.Item):
    # Restaurant name
    shop_name = scrapy.Field()
    # Cover image
    shop_img = scrapy.Field()
    # Star rating
    shop_star = scrapy.Field()
    # Number of reviews
    shop_evaluation = scrapy.Field()
    # Average price per person
    shop_price = scrapy.Field()
    # Cuisine type
    shop_type = scrapy.Field()
    # Address part 1 (district)
    shop_address1 = scrapy.Field()
    # Detailed address
    shop_address2 = scrapy.Field()
    # Recommended dish 1
    shop_food1 = scrapy.Field()
    # Recommended dish 2
    shop_food2 = scrapy.Field()
    # Recommended dish 3
    shop_food3 = scrapy.Field()
    # Taste score
    shop_sweet = scrapy.Field()
    # Environment score
    shop_environment = scrapy.Field()
    # Service score
    shop_server = scrapy.Field()

Modify the spider file, ShopSpider.py:

# -*- coding: utf-8 -*-
import scrapy
from Test.items import TestItem


class ShopSpider(scrapy.Spider):
    """
     功能:大众点评沈阳美食店铺数据
    """
    # 爬虫名
    name = 'ShopSpider'
    # 作用范围
    allowed_domains = ['dianping.com']
    # baseurl
    url = 'http://www.dianping.com/shenyang/ch10/g2714p'
    offset = 1
    # 爬取的url
    start_urls = [url + str(offset)]

    def parse(self, response):
        for each in response.xpath("//div[@class='shop-list J_shop-list shop-all-list']/ul/li"):
            # Initialize the item object
            item = TestItem()
            item['shop_name'] = each.xpath(".//img/@title").extract()[0]

            # Split the image url and keep only the part before '%'
            imgorl = each.xpath(".//img/@src").extract()[0]
            img = imgorl.split('%')[0]
            item['shop_img'] = img

            item['shop_star'] = each.xpath(".//div[@class='comment']/span/@title").extract()[0]

            # Review count and average price: both sit under identical tags,
            # so a loop counter is used to tell which value is which
            price_tag = 0
            for price in each.xpath(".//div[@class='comment']"):
                for p in price.xpath(".//a/b/text()"):
                    if price_tag == 0:
                        # When the review count is missing, the first value found is the
                        # price if it contains '¥'; otherwise it is the review count
                        ep = price.xpath(".//a/b/text()").extract()[0]
                        if '¥' in ep:
                            item['shop_price'] = ep
                        else:
                            item['shop_evaluation'] = ep
                        price_tag += 1
                    elif price_tag == 1:
                        item['shop_price'] = price.xpath(".//a/b/text()").extract()[1]
                        price_tag += 1

            # Shop type and district; address1 may be missing, so use a counter to check
            at_tag = 0
            for at in each.xpath(".//div[@class='tag-addr']"):
                for att in at.xpath(".//a/span[@class='tag']/text()"):
                    if at_tag == 0:
                        item['shop_type'] = at.xpath(".//a/span[@class='tag']/text()").extract()[0]
                        at_tag += 1
                    elif at_tag == 1:
                        item['shop_address1'] = at.xpath(".//a/span[@class='tag']/text()").extract()[1]
                        at_tag += 1

            # Detailed address
            item['shop_address2'] = each.xpath(".//div[@class='tag-addr']/span[@class='addr']/text()").extract()[0]

            # Recommended dishes; count how many there are
            food_tag = 0
            for food in each.xpath(".//div[@class='recommend']"):
                for f in food.xpath(".//a/text()"):
                    if food_tag == 0:
                        item['shop_food1'] = food.xpath(".//a/text()").extract()[0]
                        food_tag += 1
                    elif food_tag == 1:
                        item['shop_food2'] = food.xpath(".//a/text()").extract()[1]
                        food_tag += 1
                    elif food_tag == 2:
                        item['shop_food3'] = food.xpath(".//a/text()").extract()[2]
                        food_tag += 1
            # Other scores (taste / environment / service)
            score_tag = 0
            for score in each.xpath(".//span[@class='comment-list']"):
                for s in score.xpath(".//span/b/text()"):
                    if score_tag == 0:
                        item['shop_sweet'] = score.xpath(".//span/b/text()").extract()[0]
                        score_tag += 1
                    elif score_tag == 1:
                        item['shop_environment'] = score.xpath(".//span/b/text()").extract()[1]
                        score_tag += 1
                    elif score_tag == 2:
                        item['shop_server'] = score.xpath(".//span/b/text()").extract()[2]
                        score_tag += 1

            yield item

        # After finishing one page, send a request for the next page:
        # self.offset is incremented by 1, appended to the base url, and the
        # Response is handled by self.parse again
        if self.offset < 50:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

I ran into a few problems along the way, all sorted out bit by bit with Baidu~ The main points are covered in the comments.
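As an aside, the counter-based nested loops above can usually be simplified by extracting the matching nodes once and indexing into the list. A minimal sketch of that pattern for the recommended dishes (same XPath as in the spider, meant to live inside the `for each in ...` loop, not a drop-in replacement for the whole parse method):

# Grab all recommended-dish texts for this shop in one call
foods = each.xpath(".//div[@class='recommend']//a/text()").extract()
if len(foods) > 0:
    item['shop_food1'] = foods[0]
if len(foods) > 1:
    item['shop_food2'] = foods[1]
if len(foods) > 2:
    item['shop_food3'] = foods[2]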

Modify pipelines.py:

import json


class TestPipeline(object):
    """
    Purpose: save the item data
    """
    def __init__(self):
        # Open the output file (utf-8 so Chinese text is written correctly)
        self.filename = open("shuiguoshengxian.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Convert each item to a JSON line
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        # Close the file
        self.filename.close()

The file name in the __init__ method is the name of the JSON file that gets written.

Modify the settings.py file:

DEFAULT_REQUEST_HEADERS = {
  # 'User-Agent': "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;",
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}

'''
    Fake a user agent so requests are not rejected with 403
'''
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
ITEM_PIPELINES = {
   'Test.pipelines.TestPipeline': 300,
}

'''
    Don't let a 403 response crash the spider
'''
HTTPERROR_ALLOWED_CODES = [403]

The main thing to watch is the USER_AGENT setting, which prevents requests from being rejected with a 403 error.

In the terminal, run scrapy crawl ShopSpider

Once the crawl succeeds, a .json file appears; open it to see the scraped data.

Something like:

{"shop_name": "张福光九九草莓采摘园", "shop_img": "http://p0.meituan.net/deal/cbb3476245a7a22becae0835e072a031325900.png", "shop_star": "五星商户", "shop_evaluation": "22", "shop_price": "¥122", "shop_type": "水果生鲜", "shop_address1": "苏家屯区", "shop_address2": "来胜村", "shop_sweet": "9.1", "shop_environment": "9.1", "shop_server": "9.2"},
{"shop_name": "糖糖水果捞", "shop_img": "http://p0.meituan.net/waimaipoi/cc0c567369d52a43f9607a8f2734ad7033647.jpg", "shop_star": "准五星商户", "shop_evaluation": "13", "shop_price": "¥22", "shop_type": "水果生鲜", "shop_address1": "和平区", "shop_address2": "南京南街228-36号6门", "shop_sweet": "8.7", "shop_environment": "8.7", "shop_server": "8.7"},
{"shop_name": "奉鲜果切水果捞(浑南店)", "shop_img": "http://p0.meituan.net/deal/571c8808dead876be5b84a640128b12297393.jpg", "shop_star": "四星商户", "shop_evaluation": "11", "shop_type": "水果生鲜", "shop_address1": "浑南区", "shop_address2": "浑南新区夹河街A-20号10门", "shop_sweet": "7.9", "shop_environment": "7.9", "shop_server": "8.0"},

Next, write a .py script that creates the database table, so the scraped data can be stored in a database.

# -*- coding: utf-8 -*-

import pymysql

serverIp = "database host"
userName = "login user name"
password = "login password"
databaseName = "database name"

# Open the database connection (keyword arguments, matching the upload script below)
db = pymysql.connect(host=serverIp, user=userName, passwd=password, db=databaseName, charset="utf8")

# Create a cursor object with the cursor() method
cursor = db.cursor()

# CREATE TABLE statement; mind the column length limits
sql = """CREATE TABLE shuiguoshengxian (
         shop_id INT PRIMARY KEY auto_increment,
         shop_name VARCHAR(50),
         shop_img VARCHAR(150),
         shop_star VARCHAR(10),
         shop_evaluation INT,
         shop_price INT,
         shop_type VARCHAR(10),
         shop_address1 VARCHAR(15),
         shop_address2 VARCHAR(50),
         shop_food1 VARCHAR(20),
         shop_food2 VARCHAR(20),
         shop_food3 VARCHAR(20),
         shop_sweet FLOAT,
         shop_environment FLOAT,
         shop_server FLOAT)"""

# Execute the SQL with the execute() method
cursor.execute(sql)

# fetchone() would fetch a single row, e.g. for a version check:
# data = cursor.fetchone()
# print("Database version : %s " % data)

cursor.close()

# Close the database connection
db.close()
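One thing to watch: running this script a second time fails because the table already exists. A minimal guard, assuming it is fine to drop the old table and its data first:

# Drop any existing table before recreating it (this destroys old data!)
cursor.execute("DROP TABLE IF EXISTS shuiguoshengxian")
cursor.execute(sql)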

Next, write a .py script that uploads the JSON data to the database:

# -*- coding: utf-8 -*-

import json
import pymysql

serverIp = "database host"
userName = "login user name"
password = "login password"
databaseName = "database name"

# Open the database connection; note the charset="utf8" argument at the end
db = pymysql.connect(host=serverIp, user=userName, passwd=password, db=databaseName, port=3306, charset="utf8")

# Create a cursor object with the cursor() method
cursor = db.cursor()

data = []
with open('shuiguoshengxian.json') as f:
    for line in f:
        # Each line ends with ',\n'; strip those two characters so the rest parses as JSON
        data.append(json.loads(line[0:-2]))


for item in data:
    # Use get() so that a missing key falls back to a default value

    # Escape single quotes so they don't break the SQL string
    shop_name_str = item.get('shop_name', "").replace("'", "\\\'")
    shop_img_str = item.get('shop_img', '')
    shop_star_str = item.get('shop_star', '')
    shop_evaluation_str = item.get('shop_evaluation', 0)
    shop_price_stro = item.get('shop_price', '0')

    if shop_price_stro != '0':
        # Strip the leading '¥'
        shop_price_str = shop_price_stro[1:]
    else:
        shop_price_str = 0
    shop_type_str = item.get('shop_type', '')
    shop_address1_str = item.get('shop_address1', '')
    shop_address2_str = item.get('shop_address2', '')
    shop_food1_str = item.get('shop_food1', '')
    shop_food2_str = item.get('shop_food2', '')
    shop_food3_str = item.get('shop_food3', '')
    shop_sweet_str = item.get('shop_sweet', 0.0)
    shop_environment_str = item.get('shop_environment', 0.0)
    shop_server_str = item.get('shop_server', 0.0)

    sql = "INSERT INTO shuiguoshengxian(shop_name, shop_img, shop_star, shop_evaluation, shop_price, shop_type, shop_address1, shop_address2, shop_food1, shop_food2, shop_food3, shop_sweet, shop_environment, shop_server) VALUES "
    sql = sql + "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s');\r\n" % (shop_name_str, shop_img_str, shop_star_str, shop_evaluation_str, shop_price_str, shop_type_str, shop_address1_str, shop_address2_str, shop_food1_str, shop_food2_str, shop_food3_str, shop_sweet_str, shop_environment_str, shop_server_str)
    # sql = "UPDATE shops SET shop_price = '%s' WHERE shop_name = '%s';" % (shop_price_str, shop_name_str)
    cursor.execute(sql)

# The with block already closed the file, so no f.close() is needed here
db.commit()
cursor.close()
db.close()

print("success")

Note that the JSON file should sit in the project root, because of

with open('shuiguoshengxian.json') as f:

If it lives somewhere else, you can give the full path instead.

OK, that's the whole workflow.

This write-up glosses over a lot, so newcomers may well have questions; feel free to ask and I'll answer what I can. Experienced folks are also welcome to point out problems~~ The code does what I wanted, but it surely has plenty of holes, and I'd appreciate the feedback, thanks.

~~~Let's keep each other motivated.


Reposted from blog.csdn.net/qq_23057645/article/details/80846077