pymongo.errors.DuplicateKeyError: E11000 duplicate key error collection: anjuke.ershoufang index

这个bug折腾了我一个下午加一个晚上,终于把它解决掉了。

先附上一段爬取安居客二手房信息的代码

import re
import time
import pymongo
import requests
from bson import ObjectId
from lxml import etree
from pprint import pprint
# HTTP headers passed to every requests.get call in get_info: a desktop
# Chrome User-Agent, a cookie string and a referer for the Shanghai site.
# NOTE(review): the cookie looks like a captured browser session that is
# hard-coded here; it has presumably expired -- confirm and refresh it
# before running, and avoid publishing real session cookies.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "cookie": "aQQ_ajkguid=243E5D58-8B13-D7BD-4922-3DE583E03855; ctid=11; _ga=GA1.2.1030980732.1530799904; _gid=GA1.2.506397644.1530799904; 58tj_uuid=c606f59a-2fb9-4c91-9815-741fdf9cfe5d; als=0; lps=http%3A%2F%2Fwww.anjuke.com%2F%3Fpi%3DPZ-baidu-pc-all-biaoti%7Chttps%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26rsv_pq%3Dd71198bd000395ca%26rsv_t%3D6172VDlcx2zzRQ%252FLyCdcEidtafr%252BSvVyVXrlZ0lsK3U1MEz8066IF4byz4c%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D5%26rsv_sug1%3D5%26rsv_sug7%3D101; twe=2; sessid=3497C1D2-43A8-6143-B2D7-CFDA33FF0C0E; new_uv=2; __xsptplus8=8.2.1530840314.1530840335.2%232%7Cwww.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%23Z7v3XnqLDcxTHeMLiqLXQSLHvXrh8k_R%23",
    "referer": "https://shanghai.anjuke.com/?pi=PZ-baidu-pc-all-biaoti"
}

# Connect to the MongoDB server on the local machine.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# Database used by this scraper.
db = client['anjuke']
# Collection that receives the second-hand housing listings.
coll = db['ershoufang']

def _first_text_or_none(node, path):
    """Return the stripped first result of XPath ``path`` under ``node``, or None if nothing matches."""
    matches = node.xpath(path)
    return matches[0].strip() if matches else None


def get_info():
    """Scrape Anjuke Shanghai second-hand listings and hand each one to ``save``.

    Requests the listing pages ``p0`` .. ``p22``, reads every ``<li>`` under
    the element with id ``houselist-mod-new``, extracts its fields into a
    dict and passes that dict to ``save``. Prints the number of records
    passed to ``save`` when all pages are done.

    Raises:
        IndexError: if a listing lacks one of the mandatory fields
            (address, type, area, floor, name or price). The three
            "youshi" tag fields are optional and stored as None when absent.
    """
    count = 0
    # NOTE(review): page numbering starts at 0 here -- confirm the site
    # serves /sale/p0/ rather than expecting the first page to be p1.
    for i in range(23):
        response = requests.get(
            'https://shanghai.anjuke.com/sale/p{}/#filtersort'.format(i),
            headers=headers,
        )

        # Parse the response body into an HTML tree and select the listings.
        html = etree.HTML(response.text)
        htmls = html.xpath('//*[@id="houselist-mod-new"]/li')

        # NOTE: this dict is created once per page and reused for every
        # listing on that page. That reuse is the cause of the
        # DuplicateKeyError this article is about: the first insert stores
        # the generated `_id` in the dict, so the next insert sends the same
        # `_id` again. Creating the dict inside the `for h in htmls` loop
        # (one fresh dict per listing) avoids the error.
        house = {}
        for h in htmls:
            # Mandatory fields: indexing [0] raises IndexError when missing.
            h_addr = h.xpath('./div[2]/div[1]/a/text()')[0].strip()
            h_type = h.xpath('./div[2]/div[2]/span[1]/text()')[0].strip()
            h_area = h.xpath('./div[2]/div[2]/span[2]/text()')[0].strip()
            h_hight = h.xpath('./div[2]/div[2]/span[3]/text()')[0].strip()
            h_name = h.xpath('./div[2]/div[2]/span[4]/text()')[0].strip()
            h_price = h.xpath('./div[3]/span[1]/strong/text()')[0].strip()

            house.update({
                'h_addr': h_addr,
                'h_type': h_type,
                'h_area': h_area,
                'h_hight': h_hight,
                'h_name': h_name,
                # Optional tags: a listing may show fewer than three.
                'h_youshi1': _first_text_or_none(h, './div[2]/div[4]/span[1]/text()'),
                'h_youshi2': _first_text_or_none(h, './div[2]/div[4]/span[2]/text()'),
                'h_youshi3': _first_text_or_none(h, './div[2]/div[4]/span[3]/text()'),
                'h_price': h_price,
            })
            # Brief pause between records.
            time.sleep(0.01)

            save(house)
            count += 1
    print(count)

def save(house):
    """Insert one listing document into the ``ershoufang`` collection.

    Args:
        house: dict of listing fields. If it has no ``_id`` key, PyMongo
            adds one to this dict in place during the insert.
    """
    # insert_one replaces Collection.insert, which was deprecated in
    # PyMongo 3.0 and removed in PyMongo 4.0.
    coll.insert_one(house)

def main():
    """Script entry point: run the scraper."""
    get_info()

if __name__ == "__main__":
    # Start scraping only when this file is executed as a script.
    main()

这段代码只能处理两条数据,之后就会抛出上面的 DuplicateKeyError:


对比这两条数据:一条带有‘_id’,一条没有。原因是 pymongo 在插入时会把生成的‘_id’写回传入的字典,而 house 这个字典在 for 循环外只创建了一次,后面的插入就会带着同一个‘_id’,从而触发重复键错误。

目前有两条解决方案:

一:在程序中自己设置‘_id’字段,代替系统自动分配:


程序没问题:


二:将 house = {} 这个字典的创建放到 for 循环里面,让每条数据都使用一个新的字典:


这两种方法都可以解决问题。个人建议使用方法二:代码更规范,并且让系统自己分配 id。

猜你喜欢

转载自blog.csdn.net/master_ning/article/details/80947107
今日推荐