Summary of common processing methods for data analysis

1. View the data structure of each column

import numpy as np

def print_col_info(dataset):
    '''Print info for every column in the dataset:
    1. the unique values
    2. the number of unique values'''
    col_num = dataset.shape[1]
    for i in range(col_num):
        print('\ncol-{} info: '.format(i))
        temp = np.sort(list(set(dataset[:, i])))
        print('values: {}'.format(temp))
        print('values num: {}'.format(temp.shape[0]))
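A minimal usage sketch, assuming a small hypothetical NumPy array (the toy data here is illustrative, not from the original post):

# hypothetical toy dataset; a 2D NumPy array as the function expects
data = np.array([[1, 'a'], [2, 'b'], [1, 'a']])
print_col_info(data)  # prints the unique values and their count for each column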

2. Use the map function to strip whitespace from string columns with discrete values

# indices of the columns that hold discrete string values
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for col in str_cols:
    df.iloc[:, col] = df.iloc[:, col].map(lambda x: x.strip())

Usage of map: it takes a function and an iterable, applies the function to each element, and returns the results.

def square(x): return x * x
xx = map(square, range(10))
xx = list(xx)
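In Python 3 map returns a lazy iterator, which is why it is wrapped in list above; an equivalent list comprehension:

xx = [square(x) for x in range(10)]  # same result as list(map(square, range(10)))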

3. Check whether the data has missing values

# check whether the data has missing values -- any value / all values per column
df.isnull().any()
df.isnull().all()
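To go one step further and see which columns actually contain missing values, and how many, a short sketch using standard pandas idioms:

# columns that contain at least one missing value
cols_with_na = df.columns[df.isnull().any()]
print(cols_with_na)
# number of missing values in each column
print(df.isnull().sum())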

4. Replace a fixed value with NaN, then drop the rows containing NaN

# no return value; the replacement happens in place
# replace the "?" string with the NaN missing-value marker
df.replace("?", np.nan, inplace=True)
# here the samples with missing values are simply dropped
df.dropna(inplace=True)
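Dropping rows discards samples; if that is too aggressive, the missing values can instead be filled, for example with each column's mean (a sketch of an alternative, not from the original post):

# fill numeric missing values with the column mean instead of dropping rows
df.fillna(df.mean(numeric_only=True), inplace=True)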

Replace 0 with None in a single column

# first check whether there are any 'real' missing values
pima['serum_insulin'].isnull().sum()
# manually replace 0 with None
pima['serum_insulin'] = pima['serum_insulin'].map(lambda x: x if x != 0 else None)
# check the number of missing values again
pima['serum_insulin'].isnull().sum()
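The same substitution can be written with replace, which avoids the lambda (an equivalent sketch):

import numpy as np

# replace literal 0 with NaN in one pass
pima['serum_insulin'] = pima['serum_insulin'].replace(0, np.nan)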

5. Encode discrete data

from sklearn import preprocessing
label_encoder = []  # holds the encoder for each column
encoded_set = np.empty(df.shape)
for col in range(df.shape[1]):
    encoder = None
    if df.iloc[:, col].dtype == object:  # string-typed column
        encoder = preprocessing.LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(df.iloc[:, col])
    else:  # numeric column
        encoded_set[:, col] = df.iloc[:, col]
    label_encoder.append(encoder)
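Because each fitted encoder is kept in label_encoder, an encoded column can be mapped back to its original strings with LabelEncoder's inverse_transform; a sketch assuming the column at index col was string-typed:

col = 1  # hypothetical index of a string-typed column
# decode the encoded integers back to the original string labels
original = label_encoder[col].inverse_transform(encoded_set[:, col].astype(int))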

6. Inspect the min and max of continuous columns with describe, and scale columns whose ranges differ widely

df.describe()

cols = [2, 10, 11]
data_scalers = []  # holds the fitted scaler for each column
for col in cols:
    data_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    encoded_set[:, col] = np.ravel(data_scaler.fit_transform(encoded_set[:, col].reshape(-1, 1)))
    data_scalers.append(data_scaler)
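The fitted scalers are stored in data_scalers so the same scaling can be re-applied to unseen samples at prediction time, using transform rather than fit_transform; a sketch with a hypothetical new_data array of the same shape:

# apply the already-fitted scalers to new, unseen data
for scaler, col in zip(data_scalers, cols):
    new_data[:, col] = np.ravel(scaler.transform(new_data[:, col].reshape(-1, 1)))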

7. Compute accuracy, precision, recall and F1 score for a given model

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def score_cal(model, test_X, test_y):
    num_validations = 5
    accuracy = cross_val_score(model, test_X, test_y, scoring='accuracy', cv=num_validations)
    print('Accuracy: {:.2f}%'.format(accuracy.mean() * 100))
    precision = cross_val_score(model, test_X, test_y, scoring='precision_weighted', cv=num_validations)
    print('Precision: {:.2f}%'.format(precision.mean() * 100))
    recall = cross_val_score(model, test_X, test_y, scoring='recall_weighted', cv=num_validations)
    print('Recall: {:.2f}%'.format(recall.mean() * 100))
    f1 = cross_val_score(model, test_X, test_y, scoring='f1_weighted', cv=num_validations)
    print('F1 score: {:.2f}%'.format(f1.mean() * 100))
    # print the performance report
    y_pred = model.predict(test_X)
    confusion_mat = confusion_matrix(test_y, y_pred)
    print(confusion_mat)  # see what the confusion matrix looks like
    # let sklearn print precision, recall and F1 directly
    target_names = ['<=50K', '>50K']
    print(classification_report(test_y, y_pred, target_names=target_names))
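A minimal call sketch, assuming a model already fitted on the training split (the estimator choice here is illustrative, not from the original post):

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=666)  # hypothetical model choice
model.fit(X_train, y_train)
score_cal(model, X_test, y_test)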

8. Randomly split training and test sets with sklearn

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
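For classification tasks with imbalanced labels, the split can additionally preserve the class proportions via the stratify parameter:

# keep the class distribution of y identical in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=666)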

9. Grid search in sklearn (GridSearchCV)

# grid search for kNN
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
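The parameter grid above can then be fed to GridSearchCV together with a kNN estimator; a sketch of the remaining steps, assuming X_train and y_train from section 8:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)  # the best parameter combination found
print(grid_search.best_score_)   # its cross-validated score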

10. Define a function that searches over all given parameters and evaluates the machine learning pipeline by its metrics

# import the grid-search module
from sklearn.model_selection import GridSearchCV

def get_best_model_accuracy(model, params, X, y):
    grid = GridSearchCV(model,  # the model to search over
                        params,  # the parameters to try
                        error_score=0.)  # score 0 if a fit raises an error
    grid.fit(X, y)  # fit the model and parameters
    # the classic performance metric
    print("Best Accuracy: {}".format(grid.best_score_))
    # the parameters that achieved the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # average fit time in seconds
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # average prediction time in seconds
    # this metric hints at the model's real-world performance
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

11. Traverse a DataFrame column and transform each row's value

def Ticket_First_Let(x):
    return x[0]
X_train['Ticket_First_Letter'] = X_train['Ticket'].apply(Ticket_First_Let)
X_test['Ticket_First_Letter'] = X_test['Ticket'].apply(Ticket_First_Let)
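The same first-letter extraction can be done with pandas' vectorized string accessor, which avoids the helper function:

X_train['Ticket_First_Letter'] = X_train['Ticket'].str[0]
X_test['Ticket_First_Letter'] = X_test['Ticket'].str[0]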
