python-dataframe数据预处理(二)之txt数据集文件合并

上篇博文中写到了怎么将混乱的txt样本集按类别分割成几个TXT文件。这篇讲解如何从几个文件中抽取指定个数的样本组成训练集,并对数据标签进行one-hot编码转化。

# -*- coding: utf-8 -*-
'''
Merge three per-class CSV files (dataC1/dataC2/dataC3) into one dataset.

Each CSV holds one class of point data: one row per sample, 46 columns
(columns 1-45 are features, column 46 is the class label). This script:
  * maps the string labels C1/C2/C3 to the integers 0/1/2,
  * splits each class into train/test with the same ratio (so the class
    balance is preserved in both sets),
  * concatenates the per-class splits and shuffles the row order,
  * standardizes x (scaler fitted on the training set only),
  * one-hot encodes y for use by classifiers.
'''
import pandas as pd
import numpy as np
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
import os
# NOTE(review): np_utils was dropped from recent Keras releases; newer code
# should use `from keras.utils import to_categorical` — confirm against the
# installed Keras version.
from keras.utils import np_utils
from sklearn import preprocessing

# --- load each class, map its string label to an integer, split train/test ---
data1 = pd.read_csv(r'E:\py3\DataPro\yanxing510\segm\dataC1.csv')
data1['class'] = data1['class'].replace('C1', 0)            # label C1 -> 0
train_1, test_1 = train_test_split(data1, test_size=1/21)   # 1/21 of each class held out for testing

data2 = pd.read_csv(r'E:\py3\DataPro\yanxing510\segm\dataC2.csv')
data2['class'] = data2['class'].replace('C2', 1)            # label C2 -> 1
train_2, test_2 = train_test_split(data2, test_size=1/21)

data3 = pd.read_csv(r'E:\py3\DataPro\yanxing510\segm\dataC3.csv')
data3['class'] = data3['class'].replace('C3', 2)            # label C3 -> 2
train_3, test_3 = train_test_split(data3, test_size=1/21)

# --- stack the three classes vertically and shuffle the row order ---
train = pd.concat([train_1, train_2, train_3])
train = np.array(train)         # DataFrame -> ndarray for column slicing
np.random.shuffle(train)        # in-place row shuffle so classes are mixed
x_train = train[:, 0:45]        # feature columns (indices 0-44)
y_train = train[:, 45]          # label column (index 45, i.e. the 46th column)

test = pd.concat([test_1, test_2, test_3])
test = np.array(test)
np.random.shuffle(test)
x_test = test[:, 0:45]          # feature columns (indices 0-44)
y_test = test[:, 45]            # integer (non-one-hot) labels

# --- standardize features ---
# Fit the scaler on the TRAINING set only and apply the same transform to the
# test set. (The original code called preprocessing.scale on each set
# independently, which standardizes the test set with its own statistics —
# a form of leakage that also makes the two sets incomparable.)
scaler = preprocessing.StandardScaler().fit(x_train.astype('float'))
X_train = scaler.transform(x_train.astype('float'))
X_test = scaler.transform(x_test.astype('float'))

# --- one-hot encode the integer labels (3 classes -> n*3 matrix) ---
Y_train = np_utils.to_categorical(y_train, 3)
Y_test = np_utils.to_categorical(y_test, 3)

猜你喜欢

转载自blog.csdn.net/tanlangqie/article/details/80471497