阅读代码—整理学习python数据处理1

import numpy as np
import pandas as pd
# Load the training and test sets from local CSV files.
train_df = pd.read_csv("/Users/isabella/Downloads/train.csv")
test_df = pd.read_csv("/Users/isabella/Downloads/test.csv")

# Per-column overview of the training frame.
train_df.info()
# Summary statistics of the training frame.
train_df.describe()

# Boolean mask of missing cells, computed once and reused below.
null_mask = train_df.isnull()
# Number of missing values per column, largest first.
total = null_mask.sum().sort_values(ascending=False)
# Missing values as a percentage of the number of rows in each column.
percent_1 = null_mask.sum() / null_mask.count() * 100
# Keep one decimal place and sort from largest to smallest.
percent_2 = percent_1.round(1).sort_values(ascending=False)

# With axis=1, concat aligns the two Series on their index and places them
# side by side as columns; `keys` supplies the column names.
# http://blog.csdn.net/stevenkwong/article/details/52528616
missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)

# Column names of the training frame.
train_df.columns.values
# Study notes on the official Seaborn (sns) documentation:
# https://zhuanlan.zhihu.com/p/27816821

# axis=1 drops a column. DataFrame.drop returns a NEW frame, so train_df is
# rebound here to a different object than the one read from disk.
train_df = train_df.drop(['PassengerId'], axis=1)

# Collect both frames so the same preprocessing can be applied to each in a
# loop. The list is built AFTER the drop above so that it holds the current
# train_df object; building it earlier would leave the loops mutating the
# stale pre-drop frame.
data = [train_df, test_df]

import re
# Integer code for each deck letter; "U" stands for an unknown cabin.
deck = {"U": 1, "C": 2, "B": 3, "D": 4, "E": 5, "F": 6, "A": 7, "G": 8}
# First run of letters in a cabin string; compiled once instead of per row.
deck_letters = re.compile("([a-zA-Z]+)")

for dataset in data:
    # Fill missing cabins with the placeholder 'U0'. Assigning the column
    # directly avoids chained indexing (df[col][mask] = ...), which may write
    # to a temporary copy and leave the frame unchanged.
    dataset['Cabin'] = dataset['Cabin'].fillna('U0')
    # Extract the leading letters of the cabin as the deck label.
    dataset['Deck'] = dataset['Cabin'].map(lambda x: deck_letters.search(x).group())
    # Map the label to its integer code; labels missing from `deck` become
    # NaN and are then set to 0 before converting the column to int.
    dataset['Deck'] = dataset['Deck'].map(deck).fillna(0).astype(int)


# Fill missing ages with random integers drawn around the column mean.
for dataset in data:
    # Mean and standard deviation of the known ages.
    mean = dataset["Age"].mean()
    std = dataset["Age"].std()
    # Number of missing ages to fill.
    is_null = dataset["Age"].isnull().sum()
    # One random integer per missing value, drawn from [mean - std, mean + std).
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    # Work on a copy of the column; this assignment was missing, so the
    # following lines raised NameError on `age_slice`.
    age_slice = dataset["Age"].copy()
    # Overwrite only the missing entries with the random ages.
    age_slice[np.isnan(age_slice)] = rand_age
    # Write the completed column back to the frame.
    dataset["Age"] = age_slice


data = [train_df, test_df]
for dataset in data:
    # Truncate ages to integers, then replace each age with the index of its
    # band. Bands are right-closed: <=11 -> 0, (11, 22] -> 1, (22, 33] -> 2,
    # (33, 44] -> 3, (44, 55] -> 4, (55, 66] -> 5, >66 -> 6.
    ages = dataset['Age'].astype(int)
    dataset['Age'] = pd.cut(
        ages,
        bins=[float('-inf'), 11, 22, 33, 44, 55, 66, float('inf')],
        labels=False,
    )

# pd.cut() splits values into bins by value range, so bin populations can
# differ widely. pd.qcut() instead chooses the bin edges from quantiles, so
# the bins hold roughly equal numbers of rows while their widths differ.
# Here Fare is split into 6 quantile-based bands.
train_df['FareBand'] = pd.qcut(train_df['Fare'], 6)
# Group by fare band and take the mean of Survived within each band,
# listed in ascending band order.
(
    train_df[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)

# Training features and label. These names are used by the model-fitting code
# below but were never defined. 'Survived' is the label; 'FareBand' is the
# interval column added to train_df only for inspection and has no counterpart
# in test_df, so it is excluded (errors="ignore" tolerates its absence).
# NOTE(review): any remaining non-numeric columns presumably still need to be
# encoded or dropped before an estimator can be fitted — confirm.
X_train = train_df.drop(columns=["Survived", "FareBand"], errors="ignore")
Y_train = train_df["Survived"]

# Plain assignment would only bind a second name to the same object; .copy()
# gives X_test its own data, independent of test_df.
X_test  = test_df.drop("PassengerId", axis=1).copy()

# Linear classifier trained with stochastic gradient descent (SGD).
# NOTE(review): `linear_model` is not imported anywhere in the visible file;
# presumably it is scikit-learn's `sklearn.linear_model` — confirm and import.
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

# Bare expression: only displays a value in an interactive session.
sgd.score(X_train, Y_train)
# Score on the training data, expressed as a percentage rounded to 2 decimals.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)


print(round(acc_sgd,2,), "%")

#
# Build the submission table: one predicted Survived value per test passenger.
# The predictions come from `Y_pred`, the only prediction array this script
# computes; the previous name `Y_prediction` was never defined.
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })

# Write the DataFrame to CSV; index=False omits the row index column.
submission.to_csv('submission.csv', index=False)


猜你喜欢

转载自blog.csdn.net/qq_18310041/article/details/79274612