Titanic Hands-On Example

Reference book: 《深度学习原理与TensorFlow实战》 (Deep Learning Principles and TensorFlow in Practice), by 喻俨 and 莫瑜

Parts of the book's code have been adjusted to match the actual programming environment, and the version below runs end to end.

Code:

 
 
import tensorflow as tf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# List the files in the input directory
import os
print(os.listdir("../input"))

# Read the training data from the CSV file
data = pd.read_csv('../input/train.csv')
# The DataFrame.info() method gives an overview of the data
data.info()

# Select a subset of feature columns for classification and fill all missing values with 0
data['Sex'] = data['Sex'].apply(lambda s:1 if s == 'male' else 0)
data = data.fillna(0)
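# (Note, not from the book: filling missing values such as Age with 0 is a crude
# imputation; a common alternative is median imputation, e.g.
# data['Age'].fillna(data['Age'].median()).)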
dataset_X = data[['Sex','Age','Pclass','SibSp','Parch','Fare']]
dataset_X = dataset_X.values

# The two classes are "survived" and "deceased"; the 'Survived' column is the label for one of them
# Add a 'Deceased' column as the label for the second class, defined as the negation of 'Survived'
data['Deceased'] = data['Survived'].apply(lambda s: int(not s))
dataset_Y = data[['Deceased','Survived']]
dataset_Y = dataset_Y.values


from sklearn.model_selection import train_test_split
# Use sklearn's train_test_split to split the labeled data into a training set and a validation set
# The labeled data is shuffled and then split; the validation set takes 20%, as specified by test_size
X_train, X_test, y_train, y_test = train_test_split(dataset_X, dataset_Y, test_size=0.2, random_state=42)
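# (For reference, not from the book: the Kaggle train.csv has 891 rows, so this split
# leaves roughly 712 rows for training and 179 for validation.)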


# Declare placeholders for the input data
# The first element of the shape argument is None, meaning any number of records can be fed in at once
X = tf.placeholder(tf.float32,shape=[None,6])
y = tf.placeholder(tf.float32,shape=[None,2])

# Declare the model variables (weights and bias)
W = tf.Variable(tf.random_normal([6,2]), name='weights')
b = tf.Variable(tf.zeros([2]), name = 'bias')


# Build the forward-pass computation graph: the logistic (softmax) regression formula
y_pred = tf.nn.softmax(tf.matmul(X, W) + b)


# Declare the cost function
# Use cross-entropy as the cost
cross_entropy = -tf.reduce_sum(y * tf.log(y_pred + 1e-10), axis=1)
# The cost of a batch is the mean of the cross-entropy over all samples
cost = tf.reduce_mean(cross_entropy)
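# (Optional alternative, not from the book: TensorFlow also provides a numerically stable
# fused op that works on the raw logits instead of the softmax output, e.g.
#   logits = tf.matmul(X, W) + b
#   cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
# which avoids the 1e-10 clipping trick above.)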

# Add the optimization algorithm
# Minimize the cost with stochastic gradient descent; TensorFlow automatically builds the backward-pass part of the graph
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
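# (Optional, not from the book: tf.train.AdamOptimizer(0.001) is a common drop-in
# replacement that often converges faster than plain gradient descent.)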

'''
At this point, the declaration of the computation graph is complete
'''


# Create a Saver so the trained parameters can be saved and later restored for prediction
saver = tf.train.Saver()

with tf.Session() as sess:
    # Initialize all variables; this must be executed first
    tf.global_variables_initializer().run()
    
    # Training loop: iterate for 10 epochs
    for epoch in range(10):
        total_loss = 0.
        for i in range(len(X_train)):
            feed = {X: [X_train[i]], y: [y_train[i]]}
            # Trigger execution through the session.run interface
            _, loss = sess.run([train_op, cost], feed_dict=feed)
            total_loss += loss
        print('Epoch: %04d, total loss=%.9f' % (epoch + 1, total_loss))
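    # (Optional, not from the book) Feeding one sample per sess.run call is slow.
    # Because the placeholders are declared with shape [None, ...], the inner loop could
    # feed mini-batches instead, e.g. with a hypothetical batch_size = 32:
    #   for i in range(0, len(X_train), batch_size):
    #       feed = {X: X_train[i:i + batch_size], y: y_train[i:i + batch_size]}
    #       _, loss = sess.run([train_op, cost], feed_dict=feed)
    #       total_loss += loss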
    print('Training complete!')

    # Save the trained model so it can be restored later for prediction
    saver.save(sess, 'model.ckpt')

    # Evaluate accuracy on the validation set
    pred = sess.run(y_pred, feed_dict={X: X_test})
    correct = np.equal(np.argmax(pred, 1), np.argmax(y_test, 1))
    accuracy = np.mean(correct.astype(np.float32))
    print("Accuracy on validation set: %.9f" % accuracy)

'''
Training is complete and accuracy on the validation set has been evaluated
Next, make predictions on the test data
'''
testdata = pd.read_csv('../input/test.csv')
testdata = testdata.fillna(0)
testdata['Sex'] = testdata['Sex'].apply(lambda s:1 if s == 'male' else 0)
X_test = testdata[['Sex','Age','Pclass','SibSp','Parch','Fare']]

# Open a Session for prediction
with tf.Session() as sess:
    # Restore the saved model checkpoint
    saver.restore(sess, 'model.ckpt')

    # Forward-pass computation to obtain predictions
    predictions = np.argmax(sess.run(y_pred, feed_dict={X: X_test}), 1)

# Build the submission data structure and save the result as a CSV file
submission = pd.DataFrame({
    "PassengerId": testdata["PassengerId"],
    "Survived": predictions
})
submission.to_csv("titanic-submission.csv", index=False)
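
The resulting titanic-submission.csv has two columns, PassengerId and Survived, which is the format the Kaggle competition expects. As an optional sanity check (not part of the original code), the file can be inspected with:

print(pd.read_csv("titanic-submission.csv").head())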


Results:



Reposted from blog.csdn.net/qq_34062105/article/details/79920407