对原始数据进行采样过滤

# coding: utf-8
import random
import time
import csv
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Stage 1: write the raw edge data out again as raw_data.csv.
# All values are routed through one NumPy array before being written so that
# every generated CSV is serialised the same way; the later stages compare
# rows read back from these files, which requires consistent precision.
header = ['Nodeid1','Nodeid2','author_degree1','author_degree2','No','isBD']

# Path of the source data set (machine-specific).
data = pd.read_csv('/home/henson/Desktop/huanping/huanping.csv_EDGE_NBD.csv', encoding='gb18030')

X = np.array(data[header])

# Per-column views; the label-1 selection that follows this stage reads them.
Nodeid1 = X[:, 0]
Nodeid2 = X[:, 1]
author_degree1 = X[:, 2]
author_degree2 = X[:, 3]
No = X[:, 4]
isBD = X[:, 5]

data1 = list(zip(Nodeid1, Nodeid2, author_degree1, author_degree2, No, isBD))

# Open with "w" (not "a+"): appending would add a second header row and
# duplicate data on every re-run, corrupting the file for the later read.
# The file is only created once the source has been read successfully, and
# the context manager closes it even if writing fails.
with open("raw_data.csv", "w", encoding='utf-8', newline='') as f1:
    writer_csv_1 = csv.writer(f1)
    writer_csv_1.writerow(header)
    writer_csv_1.writerows(data1)




# Stage 2: write the rows whose label (isBD) is 1 to sele.csv.
# The values come from the same arrays that produced raw_data.csv, so both
# files share the same precision and their rows can be compared later.
data2 = [
    row
    for row in zip(Nodeid1, Nodeid2, author_degree1, author_degree2, No, isBD)
    if row[5] == 1  # row[5] is the isBD label
]

# Open with "w" (not "a+") so a re-run starts from a clean file instead of
# stacking another header and another copy of the label-1 rows on top of the
# previous run's output.
with open("sele.csv", "w", encoding='utf-8', newline='') as f2:
    writer_csv_2 = csv.writer(f2)
    writer_csv_2.writerow(header)
    writer_csv_2.writerows(data2)

# Stage 3: append to sele.csv every label-0 row of raw_data.csv whose feature
# vector (author_degree1, author_degree2, No) does not equal the feature
# vector of any row already in sele.csv. After this step sele.csv holds the
# merged label-1 and filtered label-0 data.
feature_columns = ['author_degree1', 'author_degree2', 'No']

# Feature vectors of the rows currently in sele.csv
# (presumably only the label-1 rows written by the preceding stage).
selected = pd.read_csv('sele.csv', encoding='utf-8')
X1 = np.array(selected[feature_columns])
# A set of tuples gives constant-time membership tests and is simply empty
# when sele.csv has no data rows. The original pairwise loop left its match
# flag unassigned in that case and failed with a NameError.
selected_features = {tuple(row) for row in X1}

data4 = pd.read_csv('raw_data.csv', encoding='utf-8')
nodeid1 = np.array(data4['Nodeid1'])
nodeid2 = np.array(data4['Nodeid2'])
X = np.array(data4[feature_columns])
isBD = np.array(data4['isBD'])

data3 = []
for first_id, second_id, features, label in zip(nodeid1, nodeid2, X, isBD):
    if label != 0:
        continue  # only label-0 rows are candidates for appending
    if tuple(features) in selected_features:
        continue  # same feature vector as a row already selected: drop it
    data3.append((first_id, second_id, features[0], features[1], features[2], label))

# Append (no header) so the rows land after the ones already in sele.csv.
# The file is opened only for the write and closed by the context manager.
with open("sele.csv", "a", encoding='utf-8', newline='') as f3:
    writer_csv_3 = csv.writer(f3)
    writer_csv_3.writerows(data3)

先把原始数据重新写入csv,生成raw_data.csv,保证数据的精度一致,才能做比较;再选出label为1的数据,精度和raw_data一致,生成sele.csv,方便做比较;然后从raw_data.csv与sele.csv中反选出输入特征向量与label=1的数据不一样的数据(即特征向量与label=1的数据不一样,且label=0的数据),追加到sele.csv。得到的数据可作为训练集,用于训练生成模型。

猜你喜欢

转载自blog.csdn.net/hensonwells/article/details/79514260
今日推荐