yelp dataset导入Neo4j详解(二)

前序文章讲解了yelp dataset导入Neo4j的详细步骤,但实际操作过程中可能会遇到各种问题。

为了避免中间环节遇到的各类问题,选择直接读取json文件,解析需要的字段导入Neo4j。下文附上详细代码。

注意:

1、数据中的记录可能重复,所以代码中为每一类节点和关系各设置了一个独立的 set,用于避免重复导入;

2、字段值中可能包含特殊字符,例如名字或地址中含有单引号(如 "name":"Marco's Pizza"),因此拼接 Cypher 语句时需要用双引号来包裹字符串;如果字段值本身还含有双引号,则应改用参数化查询。

import json
import csv
import py2neo
from py2neo import Graph,Node,Relationship
from base import openfile
import re
import reverse_geocoder as rg

# Module-level handle to the local Neo4j server; every query issued by
# read_json goes through this object.
graph = Graph("http://localhost:11006/", username="admin", password="password")

def _report_failure(message, exc):
    """Print the stage-specific message followed by the exception behind it."""
    print(message)
    print(repr(exc))


def _split_list_field(value):
    """Return the stripped, non-empty items of a comma-separated field.

    ``None`` or an empty value yields ``[]``. A string is split on commas; a
    list/tuple of strings is accepted as already split.
    """
    if not value:
        return []
    parts = value.split(',') if isinstance(value, str) else value
    return [part.strip() for part in parts if part and part.strip()]


def _import_business(item, seen):
    """Write one business record plus its City/State/Country nodes and links.

    Category names are only collected into ``seen``; they are written once the
    whole file has been read.
    """
    business_id = item['business_id']
    name = item['name']
    address = item['address']
    city = item['city']

    if business_id and name and address and business_id not in seen['business']:
        try:
            # Parameters keep quotes/backslashes in names and addresses from
            # breaking (or injecting into) the Cypher statement.
            graph.run(
                "MERGE (b:Business {business_id: $business_id}) "
                "SET b.name = $name, b.address = $address",
                {"business_id": business_id, "name": name, "address": address},
            )
        except Exception as exc:  # best-effort import: report and move on
            _report_failure("business写入Neo4j报错", exc)
        else:
            seen['business'].add(business_id)
            # "categories" can be null in the data, hence the tolerant split.
            for category in _split_list_field(item.get("categories")):
                seen['categories'].add(category)
                seen['business_categories'].add((business_id, category))

    # Location nodes are only attached to businesses that were actually written.
    if business_id not in seen['business']:
        return
    latitude = item.get("latitude")
    longitude = item.get("longitude")
    # Compare against None so that a legitimate 0.0 coordinate is not dropped.
    if latitude is None or longitude is None:
        return

    try:
        # A single coordinate pair yields a one-element result list.
        location = rg.search((latitude, longitude))[0]
        state = location["admin1"]
        country = location["cc"]

        if city not in seen['cities']:
            graph.run("MERGE (c:City {city: $city})", {"city": city})
            seen['cities'].add(city)
        # State and country are merged independently of the city check, so a
        # city name already seen in another state still gets its state node.
        if state not in seen['states']:
            graph.run("MERGE (s:State {state: $state})", {"state": state})
            seen['states'].add(state)
        if country not in seen['countries']:
            graph.run("MERGE (c:Country {country: $country})", {"country": country})
            seen['countries'].add(country)

        if (city, state) not in seen['city_state']:
            graph.run(
                "MATCH (s:State {state: $state}), (c:City {city: $city}) "
                "MERGE (c)-[:in_state]->(s)",
                {"state": state, "city": city},
            )
            seen['city_state'].add((city, state))
        if (state, country) not in seen['state_country']:
            graph.run(
                "MATCH (c:Country {country: $country}), (s:State {state: $state}) "
                "MERGE (s)-[:in_country]->(c)",
                {"country": country, "state": state},
            )
            seen['state_country'].add((state, country))
        if (business_id, city) not in seen['business_cities']:
            graph.run(
                "MATCH (b:Business {business_id: $business_id}), (c:City {city: $city}) "
                "MERGE (b)-[:in_city]->(c)",
                {"business_id": business_id, "city": city},
            )
            seen['business_cities'].add((business_id, city))
    except Exception as exc:  # best-effort import: report and move on
        _report_failure("location 写入报错", exc)


def _import_user(item, seen):
    """Write one user node and link it to every friend that already exists."""
    user_id = item["user_id"]
    if user_id in seen['users']:
        return
    user_name = item["name"]
    # The literal "None" is skipped; presumably it marks an empty friends
    # list in the dataset -- verify against the data.
    friend_ids = [
        friend_id
        for friend_id in _split_list_field(item.get("friends"))
        if friend_id != "None"
    ]
    try:
        graph.run(
            "MERGE (u:User {user_id: $user_id}) SET u.name = $name",
            {"user_id": user_id, "name": user_name},
        )
        seen['users'].add(user_id)
        if friend_ids:
            # One round trip for all friends. MATCH (not MERGE) on the friend
            # means only users that are already in the graph get linked.
            graph.run(
                "MATCH (a:User {user_id: $user_id}) "
                "UNWIND $friend_ids AS friend_id "
                "MATCH (b:User {user_id: friend_id}) "
                "MERGE (a)-[:friends]->(b)",
                {"user_id": user_id, "friend_ids": friend_ids},
            )
    except Exception as exc:  # best-effort import: report and move on
        _report_failure("用户写入报错", exc)


def _import_review(item, seen):
    """Write one review node and link it to its author and its business."""
    review_id = item["review_id"]
    if review_id in seen['reviews']:
        return
    try:
        # stars is passed as a parameter, so a numeric value is stored as-is
        # instead of failing during string concatenation.
        graph.run(
            "MERGE (r:Review {review_id: $review_id}) "
            "SET r.stars = $stars, r.date = $date, r.text = $text",
            {
                "review_id": review_id,
                "stars": item["stars"],
                "date": item["date"],
                "text": item["text"],
            },
        )
        seen['reviews'].add(review_id)
        graph.run(
            "MATCH (u:User {user_id: $user_id}), (r:Review {review_id: $review_id}) "
            "MERGE (u)-[:write]->(r)",
            {"user_id": item["user_id"], "review_id": review_id},
        )
        graph.run(
            "MATCH (b:Business {business_id: $business_id}), (r:Review {review_id: $review_id}) "
            "MERGE (r)-[:reviews]->(b)",
            {"business_id": item["business_id"], "review_id": review_id},
        )
    except Exception as exc:  # best-effort import: report and move on
        _report_failure("评论写入报错", exc)


def _write_categories(seen):
    """Write the collected Category nodes and Business-[:belong_to]->Category links."""
    for category in seen['categories']:
        try:
            graph.run("MERGE (c:Category {category: $category})", {"category": category})
        except Exception as exc:
            _report_failure("分类写入报错", exc)
    for business_id, category in seen['business_categories']:
        try:
            graph.run(
                "MATCH (b:Business {business_id: $business_id}), (c:Category {category: $category}) "
                "MERGE (b)-[:belong_to]->(c)",
                {"business_id": business_id, "category": category},
            )
        except Exception as exc:
            _report_failure("business与分类关系写入报错", exc)


def read_json(path, filename):
    """Import one line-delimited JSON file into Neo4j.

    The file at ``path + filename`` is read one line at a time; each non-blank
    line must be a JSON object. ``filename`` selects what is written through
    the module-level ``graph``:

    * ``'business.json'``: Business, City, State, Country and Category nodes
      plus ``in_city``, ``in_state``, ``in_country`` and ``belong_to`` links.
    * ``'user.json'``: User nodes plus ``friends`` links to users that already
      exist in the graph.
    * ``'review.json'``: Review nodes plus ``write`` / ``reviews`` links. The
      User and Business nodes are looked up with MATCH, so they must have been
      imported beforehand for the links to be created.

    Any other filename is parsed but nothing is written.

    Args:
        path: Directory prefix, joined to ``filename`` by plain concatenation
            (so it must end with a path separator).
        filename: Name of the file to import.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    Failures while writing a single record are printed and the import
    continues with the next record.
    """
    handlers = {
        'business.json': _import_business,
        'user.json': _import_user,
        'review.json': _import_review,
    }
    handler = handlers.get(filename)

    # Per-call de-duplication sets; they only save redundant round trips,
    # since every write is an idempotent MERGE.
    seen = {
        key: set()
        for key in (
            'business', 'users', 'reviews',
            'cities', 'states', 'countries', 'categories',
            'business_cities', 'business_categories',
            'city_state', 'state_country',
        )
    }

    # Stream the file: the review dump is far too large for readlines().
    with open(path + filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            item = json.loads(line)
            if handler is not None:
                handler(item, seen)

    # Categories are collected while reading and written once at the end.
    _write_categories(seen)


if __name__ == '__main__':
    # Directory holding the Yelp JSON dumps. It must end with a separator
    # because read_json joins it to the filename by plain concatenation.
    path = 'D:/share/yelp/'
    # Order matters: the review import looks up existing User and Business
    # nodes with MATCH, so both must be loaded before review.json or the
    # review relationships are silently not created.
    for filename in ('business.json', 'user.json', 'review.json'):
        read_json(path, filename)
发布了123 篇原创文章 · 获赞 12 · 访问量 5万+

猜你喜欢

转载自blog.csdn.net/haiziccc/article/details/103968686