第一部分 转换数据:把一个文件夹下的所有文件合并写入一个大文件
import os
import pandas as pd
import sys
# This module converts the raw result files into one combined table.
# Module-level accumulator: readfile() appends every parsed file to it.
DATA = pd.DataFrame()


def readfile(filepath, outpath="D:\\SwarmData\\trans_result\\trans.csv"):
    """Merge every data file found one level below *filepath* into one CSV.

    *filepath* is expected to contain sub-folders, each holding CSV data
    files.  For every file, the first header cell of the first line must look
    like ``"<key>:<number> ..."`` (the number is stored as ``swarmsize``) and
    the second line must be the real header containing at least the columns
    ``step_index``, ``gap`` and ``r``.

    Args:
        filepath: Root folder whose sub-folders contain the data files.
        outpath: Destination CSV.  Defaults to the original hard-coded
            location.

    Side effects:
        Prints the sorted sub-folder names, appends the parsed rows to the
        module-level ``DATA`` frame and writes ``DATA`` (including its index
        column) to *outpath*.
    """
    global DATA

    externfiles = sorted(os.listdir(filepath))
    print(externfiles)

    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies all accumulated rows every time.
    frames = []
    for files in externfiles:
        folder = os.path.join(filepath, files)
        if not os.path.isdir(folder):
            # Stray files next to the result folders are not data folders.
            continue
        for datafile in sorted(os.listdir(folder)):
            path = os.path.join(folder, datafile)
            # Only the header row is needed here: its first cell encodes the
            # swarm size as "<key>:<number> ...".
            header = pd.read_csv(path, nrows=0)
            first_cell = str(list(header.columns)[0])
            size = float(first_cell.split(" ")[0].split(":")[1])

            # The real column names live on the second line of the file.
            data = pd.read_csv(path, header=1)
            # .copy() so that adding a column never touches a view of `data`.
            data = data[['step_index', 'gap', 'r']].copy()
            data.insert(0, 'swarmsize', size)
            frames.append(data)

    if frames:
        parts = frames if DATA.empty else [DATA] + frames
        DATA = pd.concat(parts, axis=0, ignore_index=True)
    # The index is written on purpose: the later steps drop 'Unnamed: 0'.
    DATA.to_csv(outpath)
# Source folder: the first command-line argument when given, otherwise the
# built-in default location.
filepath = sys.argv[1] if len(sys.argv) > 1 else "D:\\SwarmData\\result"
print(filepath)
readfile(filepath)
第二部分 从大文件中挑选出需要处理的数据
import pandas as pd
import os
import sys
# This module selects the rows with step_index == 250 from the converted data.
# Final output file, written by pickfile().
pickfilepath="D:\\SwarmData\\trans_result\\trans_step250.csv"
# Intermediate file: written by readfile() and read back by pickfile().
readfilepath="D:\\SwarmData\\trans_result\\trans_step.csv"
def pickfile(pickfilepath):
    """Re-save the intermediate CSV to *pickfilepath* without its old index.

    Reads the module-level ``readfilepath`` file, removes the
    ``'Unnamed: 0'`` column that the previous ``to_csv`` call produced, and
    writes the remaining table (with a fresh index column) to *pickfilepath*.
    """
    cleaned = pd.read_csv(readfilepath).drop('Unnamed: 0', axis=1)
    cleaned.to_csv(pickfilepath)
def readfile(filepath, step=250):
    """Keep only the rows of one simulation step and hand them to pickfile().

    Args:
        filepath: CSV produced by the conversion step; must contain the
            columns ``'Unnamed: 0'`` and ``'step_index'``.
        step: Value of ``step_index`` to keep.  Defaults to 250, the value
            that used to be hard-coded.

    Side effects:
        Writes the filtered rows to the module-level ``readfilepath`` and
        then calls ``pickfile(pickfilepath)`` to produce the final file.
    """
    data = pd.read_csv(filepath)
    # Filter and drop in one expression so a new frame is produced; an
    # in-place drop on the .loc result raises SettingWithCopyWarning.
    data_step = data.loc[data['step_index'] == step].drop('Unnamed: 0', axis=1)
    data_step.to_csv(readfilepath)
    pickfile(pickfilepath)
# Input file: the first command-line argument when given, otherwise the
# default output location of the conversion step.
filepath = (
    sys.argv[1]
    if len(sys.argv) > 1
    else "D:\\SwarmData\\trans_result\\trans.csv"
)
print(filepath)
readfile(filepath)
第三部分 回归系数计算:利用交叉验证确定最佳γ,并显示拟合平面
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
from numpy import *
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import numpy as np
# Module-level state shared by the functions below.
# 3x1 coefficient vector filled in by crossValidation():
# w[0] = swarmsize coefficient, w[1] = gap coefficient, w[2] = intercept.
w=zeros((3,1))
# Replaced by readfile() with the loaded CSV plus a constant 'const1' column.
data = pd.DataFrame()
# NOTE(review): traindata, testdata and L are only referenced by
# commented-out code in the visible part of this file.
traindata=pd.DataFrame()
testdata = pd.DataFrame()
L=[]
# Set by readfile(): the 'swarmsize' and 'gap' columns (model inputs).
xArr=[]
# Set by readfile(): the 'r' column as a list (model output).
yArr=[]
def readfile(filepath):
    """Load the step-filtered CSV and publish it through the module globals.

    Args:
        filepath: CSV containing at least the columns ``'Unnamed: 0'``,
            ``'swarmsize'``, ``'gap'`` and ``'r'``.

    Side effects:
        Rebinds three module-level names:

        * ``data`` - the loaded frame without ``'Unnamed: 0'`` and with an
          extra constant column ``'const1'`` equal to 1;
        * ``xArr`` - 2-D array holding the ``swarmsize`` and ``gap`` columns
          (the model inputs);
        * ``yArr`` - list holding the ``r`` column (the model output).
    """
    global data
    global xArr
    global yArr

    data = pd.read_csv(filepath).drop('Unnamed: 0', axis=1)
    data['const1'] = 1
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in old and new pandas versions alike.
    xArr = data[['swarmsize', 'gap']].values
    yArr = data['r'].tolist()
# Error function.
def rssError(yArr, yHatArr):
    """Return the residual sum of squares between targets and predictions.

    Both arguments may be any array-like (list, ndarray, matrix) holding the
    same number of values.  They are flattened before subtraction so that a
    column vector compared with a row vector is matched element by element
    instead of being broadcast into an n-by-n matrix.

    Args:
        yArr: Observed values.
        yHatArr: Predicted values.

    Returns:
        The sum of squared differences as a NumPy float.
    """
    actual = np.asarray(yArr, dtype=float).ravel()
    predicted = np.asarray(yHatArr, dtype=float).ravel()
    return ((actual - predicted) ** 2).sum()
# Coefficient solver for the model r = [size, gap] * ws + b: computes ws.
def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve the ridge-regression normal equations for the weights.

    Computes ``ws`` such that ``(X^T X + lam * I) ws = X^T y``.

    Args:
        xMat: Feature matrix, one row per sample (matrix or 2-D array-like).
        yMat: Targets as a column (matrix / 2-D array-like) or as a flat
            sequence with one value per sample.
        lam: Ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The weights as an ``np.matrix`` column with one row per feature, or
        ``None`` (after printing a message) when the regularised system is
        singular.
    """
    x = np.asarray(xMat, dtype=float)
    y = np.asarray(yMat, dtype=float)
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    denom = x.T.dot(x) + np.eye(x.shape[1]) * lam
    # Solve the linear system directly instead of forming the inverse, and
    # let LAPACK report singularity: comparing a floating-point determinant
    # with exactly 0.0 is unreliable.
    try:
        ws = np.linalg.solve(denom, x.T.dot(y))
    except np.linalg.LinAlgError:
        print("This matrix is singular, cannot do inverse")
        return None
    # Callers rely on the matrix type (they use ``ws.T`` as a row).
    return np.asmatrix(ws)
def ridgeTest(xArr, yArr, numTestPts=30):
    """Fit ridge regression for a grid of penalties on standardised data.

    The targets are centred on their mean and every feature column is
    centred and divided by its variance.  The model is then fitted for
    ``lam = exp(i - 10)`` with ``i = 0 .. numTestPts - 1``.

    Args:
        xArr: Samples, one row per sample (2-D array-like).
        yArr: Targets, one value per sample (flat sequence).
        numTestPts: Number of penalties to try.  Defaults to 30, the value
            that used to be hard-coded.

    Returns:
        ndarray of shape ``(numTestPts, n_features)``; row ``i`` holds the
        weights obtained with ``lam = exp(i - 10)``.

    Raises:
        numpy.linalg.LinAlgError: if ``ridgeRegres`` reports a singular
            system for one of the penalties.
    """
    # np.asmatrix replaces the ``mat`` alias that NumPy 2.0 removed; the
    # matrix type is kept because ridgeRegres is called with matrices.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T

    # Centre the targets so that no intercept column is needed.
    yMat = yMat - np.mean(yMat, 0)
    # Standardise the features: subtract the column mean, then divide by the
    # column variance (the variance, not the standard deviation).
    xMat = (xMat - np.mean(xMat, 0)) / np.var(xMat, 0)

    # wMat stores one weight vector per penalty value.
    wMat = np.zeros((numTestPts, xMat.shape[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        if ws is None:
            # Fail with a clear message instead of "None has no attribute T".
            raise np.linalg.LinAlgError(
                "ridgeRegres found a singular system for lam=exp(%d)" % (i - 10)
            )
        wMat[i, :] = ws.T
    return wMat
def crossValidation(xArr, yArr, numVal=10):
    """Pick the best ridge penalty by repeated 90/10 hold-out validation.

    For each of *numVal* rounds the samples are shuffled, the first 90% are
    used to fit the ridge weights for every penalty returned by
    ``ridgeTest`` and the remaining 10% are used to measure the residual sum
    of squares.  The penalty with the lowest mean error wins; its weights
    are converted back to the original (unstandardised) feature scale and
    stored in the module-level ``w`` as
    ``[swarmsize coefficient, gap coefficient, intercept]``.

    Args:
        xArr: Samples with exactly two feature columns (2-D array-like).
        yArr: Targets, one value per sample.
        numVal: Number of shuffle/split rounds.

    Raises:
        ValueError: if *numVal* is smaller than 1 or there are too few
            samples to leave at least one of them for testing.

    Side effects:
        Overwrites the entries of the global ``w`` and prints it.
    """
    xAll = np.asarray(xArr)
    yAll = np.asarray(yArr)
    # m is the number of samples.
    m = len(yAll)
    # Number of indices j with j < m * 0.9, i.e. the size of the training part.
    nTrain = int(np.ceil(m * 0.9))
    if numVal < 1:
        raise ValueError("numVal must be at least 1")
    if nTrain >= m:
        raise ValueError("not enough samples to hold out a test set")

    indexList = list(range(m))
    errorMat = None
    for i in range(numVal):
        # Split the data: 90% training, 10% test.
        np.random.shuffle(indexList)
        trainIdx = indexList[:nTrain]
        testIdx = indexList[nTrain:]
        trainX, trainY = xAll[trainIdx], yAll[trainIdx]
        testX, testY = xAll[testIdx], yAll[testIdx]

        # One weight vector per candidate penalty.
        wMat = ridgeTest(trainX, trainY)
        if errorMat is None:
            # Size the error table from what ridgeTest actually returned.
            errorMat = np.zeros((numVal, wMat.shape[0]))

        # Standardise the test rows with the *training* statistics; this
        # does not depend on the penalty, so it is done once per round.
        meanTrain = np.mean(trainX, 0)
        varTrain = np.var(trainX, 0)
        matTestX = np.asmatrix((testX - meanTrain) / varTrain)
        yMeanTrain = np.mean(trainY)

        for k in range(wMat.shape[0]):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + yMeanTrain
            # Error of the k-th weight vector in this round.
            errorMat[i, k] = rssError(np.asarray(yEst).ravel(), testY)

    # Average performance of each penalty over all rounds.
    meanErrors = np.mean(errorMat, 0)
    # NOTE: as before, the weights are taken from the fit of the final
    # round.  argmin picks a single row even when several penalties tie, so
    # the intercept below is never computed from more than one weight vector.
    bestWeights = wMat[int(np.argmin(meanErrors))]

    # Undo the standardisation: with Xreg = (x - meanX) / var(x) the model
    # becomes x * w / var(x) - meanX * w / var(x) + meanY.
    meanX = np.mean(xAll, 0)
    varX = np.var(xAll, 0)
    unReg = bestWeights / varX
    # The two coefficients plus the constant term are stored as a 3x1 vector
    # so that r = [size, gap, 1] * w.
    w[0][0] = unReg[0]
    w[1][0] = unReg[1]
    w[2][0] = float(np.mean(yAll) - np.sum(meanX * unReg))
    print(w)
def show():
    """Plot the samples per swarm size together with the fitted plane.

    Reads the module-level ``data`` frame (columns ``swarmsize``, ``gap``
    and ``r``) and the coefficient vector ``w``.  Draws one 3-D scatter
    series for each swarm size from 60 to 200 in steps of 20, then the plane
    ``r = w[1] * gap + w[0] * swarmsize + w[2]`` over gap in [0, 100) and
    swarmsize in [60, 200), and finally opens the matplotlib window.
    """
    ax = plt.axes(projection='3d')
    ax.set_xlim(0, 100)
    ax.set_ylim(60, 200)
    ax.set_zlim(0, 320)
    ax.set_title('step_index=250', fontsize=15)
    ax.set_xlabel('gap', fontsize=15)
    ax.set_ylabel('swarmsize', fontsize=15)
    ax.set_zlabel('r', fontsize=15)

    # One scatter series per swarm size, each with its own colour.
    series = (
        (60, 'b'),
        (80, 'g'),
        (100, 'c'),
        (120, 'y'),
        (140, 'k'),
        (160, 'slategrey'),
        (180, 'm'),
        (200, 'r'),
    )
    for size, colour in series:
        subset = data.loc[data['swarmsize'] == size]
        ax.scatter3D(
            np.array(subset['gap']),
            np.array(subset['swarmsize']),
            np.array(subset['r']),
            s=5,
            color=colour,
            label='size:%d' % size,
        )
    ax.legend(loc='best')

    # meshgrid turns the two coordinate vectors into coordinate matrices:
    # one column per gap value, one row per swarmsize value.
    X, Y = np.meshgrid(np.arange(0, 100, 1), np.arange(60, 200, 1))
    Z = float(w[1][0]) * X + float(w[0][0]) * Y + float(w[2][0])
    ax.plot_surface(X, Y, Z, rstride=1, cstride=1, linewidth=0, color='w')
    plt.show()
def testfunc():
    """Print the absolute-error statistics of the fitted plane per swarm size.

    For every swarm size from 60 to 200 in steps of 20, the rows of the
    module-level ``data`` frame with that size are predicted with
    ``[swarmsize, gap, const1] . w`` and the mean and variance of the
    absolute prediction error are printed.
    """

    def test(size, num):
        # `size` is the sub-frame for one swarm size, `num` its label.
        # DataFrame.as_matrix() was removed in pandas 1.0; use .values.
        xtestMat = size[['swarmsize', 'gap', 'const1']].values
        ytest = size['r'].values.astype(float)
        yOut = np.asarray(np.dot(xtestMat, w), dtype=float).ravel()
        absErr = np.abs(ytest - yOut)
        print('swarmsize', num, ': 误差期望:', np.mean(absErr), ' 误差方差:', np.var(absErr))

    for num in (60, 80, 100, 120, 140, 160, 180, 200):
        test(data.loc[data['swarmsize'] == num], num)
# Input file: the first command-line argument when given, otherwise the
# built-in default.
# NOTE(review): this default contains an extra "ridge" folder compared with
# the "D:\\SwarmData\\trans_result\\trans_step250.csv" path written by the
# selection step - confirm which location is intended.
filepath = (
    sys.argv[1]
    if len(sys.argv) > 1
    else "D:\\SwarmData\\ridge\\trans_result\\trans_step250.csv"
)
# Load the data.
readfile(filepath)
# Ridge regression driven by cross-validation.
crossValidation(xArr, yArr)
# Visualisation.
show()
# Optional validation report (disabled).
#testfunc()
拟合效果图