python scrapy爬取网站数据一

之前写过一篇 scrapy 的介绍,讲了 scrapy 的环境如何配置,该篇博客是:《win10 python安装及环境配置、scrapy框架安装及PyCharm集成》。
本篇通过一个实际的例子,记录 scrapy 的使用。
大家都对三国很熟,下面我们从三国在线(http://www.e3ol.com/biography-index.html)获取三国人物数据,整体代码如下。代码所抓取的网址返回的是 JSON 格式的数据(其中的键没有加引号,不是严格的 JSON);代码会先解析该 JSON 数据,再按 JSON 的键创建数据表,并保存人物信息。

import scrapy
import json
import pymysql
import re
from sgyyScrapy.items import SgyyscrapyItem

# Request headers (a desktop Chrome User-Agent string) passed to every
# scrapy.Request the spider issues.
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

class sgyyScrapy(scrapy.Spider):
    """Spider that copies Three Kingdoms character records from e3ol.com into MySQL.

    Each listing response is a JSON-like document whose object keys are not
    quoted.  The spider repairs it, parses it, and writes every entry found
    under the ``soul`` key into the ``sgyy_person`` table.  The table is
    dropped and re-created once per run, with one ``varchar(1000)`` column
    for each key of the first record that is stored.

    NOTE(review): the database credentials are hard-coded in ``parse``; they
    should come from the project settings or the environment.
    """

    name = "sgyyScrapy"
    # Scrapy reads ``allowed_domains``; the previous spelling ``allowed_domins``
    # was silently ignored.  Entries must be bare host names, not URLs.
    allowed_domains = ["www.e3ol.com"]
    start_urls = []
    # Becomes True once the target table has been re-created during this run.
    isCreateTable = False

    # Matches either a complete double-quoted string (kept untouched) or a bare
    # word immediately followed by a colon (an unquoted object key).
    _JSON_TOKEN_PAT = re.compile(r'"(?:\\.|[^"\\])*"|(\w+):', re.DOTALL)
    # Only plain word characters are accepted as column names.
    _COLUMN_NAME_PAT = re.compile(r'\w+')

    def start_requests(self):
        """Yield one request per (faction, page) pair: factions 1-14, pages 1-50.

        URLs are generated on the fly instead of being appended to the
        class-level ``start_urls`` list, which is shared by every instance and
        would accumulate duplicates if this method ran more than once.
        """
        # Sanguo Online, selected by the faction a character mainly served.
        # Full address: http://www.e3ol.com/biography/inc_ajax.asp?types=index&a2=%s&pageno=1
        urlhead = 'http://www.e3ol.com/biography/inc_ajax.asp?types=index&a2=%s'

        for faction in range(1, 15):
            baseUrl = urlhead % faction + '&pageno=%s'
            for page in range(1, 51):
                yield scrapy.Request(baseUrl % page, headers=headers, callback=self.parse)

    def parse(self, response):
        """Parse one listing response and store each of its records in MySQL.

        A response that cannot be decoded, or that carries no records, is
        logged and skipped.  Database errors are handled per record, so one
        bad row does not abort the rest of the page.
        """
        try:
            jsonObject = self._load_payload(response.text)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError alike.
            self.logger.warning("Could not decode response from %s", response.url, exc_info=True)
            return

        records = jsonObject.get('soul') if isinstance(jsonObject, dict) else None
        if not records:
            self.logger.debug("No records in response from %s", response.url)
            return

        # Database connection.
        db = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="zhl", database="sgyy", charset='utf8')
        try:
            with db.cursor() as cursor:
                for item in records:
                    self._store_record(db, cursor, item)
        finally:
            # Always release the connection, even if a record raised.
            db.close()
        self.logger.debug("结束")

    def _load_payload(self, text):
        """Turn the raw response text into a Python object; raise ValueError on failure."""
        # The endpoint sends non-ASCII text as literal \uXXXX escape sequences;
        # this round-trip converts them into the real characters.
        # NOTE(review): raw non-ASCII characters in the body would be garbled by
        # this step, and escaped quotes would be unescaped before parsing.
        encodeStr = text.encode('utf-8').decode('unicode_escape')
        # Drop the first and last character wrapping the payload
        # (presumably a pair of parentheses -- confirm against a live response).
        encodeJsonStr = encodeStr[1:-1]
        # NOTE(review): this also removes spaces inside values.
        encodeJsonStr = encodeJsonStr.replace(" ", "")
        self.logger.debug("%s", encodeJsonStr)
        # The keys are not quoted, which json.loads rejects, so quote them first.
        reEncodeStr = self.quote_keys_for_json(encodeJsonStr)
        self.logger.debug("%s", reEncodeStr)
        # NOTE(review): turning every apostrophe into a double quote breaks
        # values that contain an apostrophe.
        return json.loads(reEncodeStr.replace("'", "\""))

    def _store_record(self, db, cursor, item):
        """Insert one record, re-creating the table from its keys on first use."""
        if not isinstance(item, dict) or not item:
            self.logger.warning("Skipping empty or non-object record: %r", item)
            return

        # Identifiers cannot be bound as query parameters, so restrict them to
        # word characters and backtick-quote them (which also copes with keys
        # that are MySQL reserved words).
        invalid = [key for key in item if not self._COLUMN_NAME_PAT.fullmatch(key)]
        if invalid:
            self.logger.warning("Skipping record with unusable column names: %r", invalid)
            return

        columns = ["`%s`" % key for key in item]
        # Every column is varchar, so store the text form of each value.
        values = [str(item[key]) for key in item]
        # The values travel as bound parameters, so quotes and backslashes in
        # scraped text can no longer break or alter the statement.
        insertSQL = "insert into sgyy_person(%s) values (%s)" % (
            ",".join(columns),
            ",".join(["%s"] * len(columns)),
        )

        try:
            if not self.isCreateTable:
                createSQL = "create table sgyy_person(%s)" % ",".join(
                    column + " varchar(1000)" for column in columns
                )
                self.logger.debug("%s", createSQL)
                cursor.execute("DROP TABLE IF EXISTS sgyy_person")
                cursor.execute(createSQL)
                # Set only after success, so a failed creation is retried
                # with the next record instead of dooming every insert.
                self.isCreateTable = True
            self.logger.debug("%s %r", insertSQL, values)
            cursor.execute(insertSQL, values)
            db.commit()
        except pymysql.MySQLError:
            self.logger.exception("发生错误,回滚事务")
            db.rollback()

    def quote_keys_for_json(self, json_str):
        """Return ``json_str`` with every unquoted object key wrapped in double quotes.

        Double-quoted strings are copied through unchanged, so text inside a
        value that happens to look like ``word:`` is never touched.  The scan
        is a single regex pass; no placeholder character is substituted into
        the text, so the input may contain any character.

        Note: for generally lenient JSON parsing see
        https://github.com/dmeranda/demjson (slower than the standard library).
        """
        def quote_bare_key(match):
            key = match.group(1)
            if key is None:
                # A quoted string was matched: leave it exactly as it is.
                return match.group(0)
            return '"%s":' % key

        return self._JSON_TOKEN_PAT.sub(quote_bare_key, json_str)

猜你喜欢

转载自blog.csdn.net/fengshuiyue/article/details/80857875