数据预处理加简单的对数几率回归

以下是数据预处理代码

import pandas as pd
import numpy as np
# ================================ Read the data ================================================
data = pd.read_csv('data.csv',encoding = 'gbk')
# data.info()   # per-column non-null counts and dtypes
# print(data.shape)   # shape of the DataFrame
# ============ Extract the label y as required (actually done in the modeling script) ===========
# y = data.pop('status')
print(data.shape)
# print(y)
# ===== Manually classify the features; single-valued ones are dropped later to form the new X ==
features = data.columns  # all column names

# Categorical features.
kindlist = ['bank_card_no','is_high_user','first_transaction_time','reg_preference_for_trad','source','latest_query_time',
            'loans_latest_time']
# Continuous (float) features.
numlist = ['low_volume_percent','middle_volume_percent','trans_amount_increase_rate_lately','trans_activity_month',
           'trans_activity_day','rank_trad_1_month','top_trans_count_last_1_month','avg_price_top_last_12_valid_month']
# Identifier columns (not usable as model features).
idlist = ['custid','trade_no','id_name']
# Everything left over is treated as integer-valued numeric.
intnumlist = features.drop(kindlist).drop(numlist).drop(idlist).drop(['status'])
# Sanity check: the four groups together should add up to 89 features.
print(len(intnumlist),len(numlist),len(idlist),len(kindlist))

uniquefeat = []  # columns whose value is identical for every sample
# student_feature is largely missing; treat NaN as 0 before scanning for constant columns.
data['student_feature'] = data['student_feature'].fillna(0)
# print(data['student_feature'])
for feat in features:
    if len(data[feat].unique()) == 1:
        uniquefeat.append(feat)
print(uniquefeat)

# Impute: mean for numeric columns, forward-fill for categorical columns; id columns untouched.
for name in intnumlist:
    data[name] = data[name].fillna(int(data[name].mean()))
for name in numlist:
    data[name] = data[name].fillna(data[name].mean())
for name in kindlist:
    # .ffill() replaces the deprecated fillna(method='ffill').
    data[name] = data[name].ffill()

# Verify that no missing values remain.
# BUG FIX: the original tested `len(data[feat]) < data.shape[0]`, which is always
# False because a Series' length counts NaN entries too — the check could never
# report a null column. Test isnull() explicitly instead.
for group in (numlist, intnumlist, kindlist, idlist):
    for feat in group:
        if data[feat].isnull().any():
            print('{} is null'.format(feat))

# Drop the constant columns and the free-text id_name column, then persist.
data.drop(uniquefeat,axis = 1,inplace=True)
data.drop(['id_name'],axis = 1, inplace= True)
data.to_csv('newdata.csv',encoding = 'gbk',index= False)

以下是对数几率回归代码

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Load the cleaned dataset produced by the preprocessing script.
data = pd.read_csv('newdata.csv',encoding = 'gbk')
# print(data[data.isnull().values==True])
# print(data.columns)
# print(data['status'])
y = data.pop('status')  # binary target label
# Drop identifier / date / raw-categorical columns that cannot be fed to the model as-is.
data.drop(['custid','trade_no','first_transaction_time','latest_query_time','loans_latest_time','reg_preference_for_trad'],axis = 1,inplace = True)

# Split first, then standardize.
# BUG FIX: the original applied preprocessing.scale to the FULL dataset before
# splitting, so the test set's mean/std leaked into the training features. Fit
# the scaler on the training split only and apply the same transform to both.
data_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3,random_state=2019)
scaler = preprocessing.StandardScaler().fit(data_train)
data_train = scaler.transform(data_train)
X_test = scaler.transform(X_test)

# Fit a logistic-regression classifier and report mean accuracy on the test set.
lr = LogisticRegression(random_state = 2018)
lr.fit(data_train,y_train)
print(lr.score(X_test,y_test))

猜你喜欢

转载自blog.csdn.net/Mr__kou/article/details/88885418