Summary of common processing methods for data analysis

1. View the data structure of each column

import numpy as np

def print_col_info(dataset):
    '''Print info for every column in the dataset:
    1. the unique values
    2. the number of unique values'''
    col_num = dataset.shape[1]
    for i in range(col_num):
        print('\ncol-{} info: '.format(i))
        temp = np.sort(list(set(dataset[:, i])))
        print('values: {}'.format(temp))
        print('values num: {}'.format(temp.shape[0]))
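A minimal usage sketch, assuming a small hypothetical NumPy array (the toy data here is illustrative, not from the original post):

# hypothetical toy dataset; a 2D NumPy array as the function expects
data = np.array([[1, 'a'], [2, 'b'], [1, 'a']])
print_col_info(data)  # prints the unique values and their count for each column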

2. Use the map function to strip whitespace from string columns with discrete values

# indices of the columns that hold discrete string values
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for col in str_cols:
    df.iloc[:, col] = df.iloc[:, col].map(lambda x: x.strip())

Usage of map: it takes a function and an iterable, applies the function to each element, and returns the results.

def square(x): return x * x
xx = map(square, range(10))
xx = list(xx)
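In Python 3 map returns a lazy iterator, which is why it is wrapped in list above; an equivalent list comprehension:

xx = [square(x) for x in range(10)]  # same result as list(map(square, range(10)))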

3. Check whether the data has missing values

# check whether the data has missing values -- any value / all values per column
df.isnull().any()
df.isnull().all()
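To go one step further and see which columns actually contain missing values, and how many, a short sketch using standard pandas idioms:

# columns that contain at least one missing value
cols_with_na = df.columns[df.isnull().any()]
print(cols_with_na)
# number of missing values in each column
print(df.isnull().sum())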

4. Replace a fixed value with NaN, then drop the rows containing NaN

# no return value; the replacement happens in place
# replace the "?" string with the NaN missing-value marker
df.replace("?", np.nan, inplace=True)
# here the samples with missing values are simply dropped
df.dropna(inplace=True)
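Dropping rows discards samples; if that is too aggressive, the missing values can instead be filled, for example with each column's mean (a sketch of an alternative, not from the original post):

# fill numeric missing values with the column mean instead of dropping rows
df.fillna(df.mean(numeric_only=True), inplace=True)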

Replace 0 with None in a single column

# first check whether there are any 'real' missing values
pima['serum_insulin'].isnull().sum()
# manually replace 0 with None
pima['serum_insulin'] = pima['serum_insulin'].map(lambda x: x if x != 0 else None)
# check the number of missing values again
pima['serum_insulin'].isnull().sum()
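The same substitution can be written with replace, which avoids the lambda (an equivalent sketch):

import numpy as np

# replace literal 0 with NaN in one pass
pima['serum_insulin'] = pima['serum_insulin'].replace(0, np.nan)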

5. Encode discrete data

from sklearn import preprocessing
label_encoder = []  # holds the encoder for each column
encoded_set = np.empty(df.shape)
for col in range(df.shape[1]):
    encoder = None
    if df.iloc[:, col].dtype == object:  # string-typed column
        encoder = preprocessing.LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(df.iloc[:, col])
    else:  # numeric column
        encoded_set[:, col] = df.iloc[:, col]
    label_encoder.append(encoder)
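Because each fitted encoder is kept in label_encoder, an encoded column can be mapped back to its original strings with LabelEncoder's inverse_transform; a sketch assuming the column at index col was string-typed:

col = 1  # hypothetical index of a string-typed column
# decode the encoded integers back to the original string labels
original = label_encoder[col].inverse_transform(encoded_set[:, col].astype(int))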

6. Inspect the min and max of continuous columns with describe, and scale columns whose ranges differ widely

df.describe()

cols = [2, 10, 11]
data_scalers = []  # holds the fitted scaler for each column
for col in cols:
    data_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    encoded_set[:, col] = np.ravel(data_scaler.fit_transform(encoded_set[:, col].reshape(-1, 1)))
    data_scalers.append(data_scaler)
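The fitted scalers are stored in data_scalers so the same scaling can be re-applied to unseen samples at prediction time, using transform rather than fit_transform; a sketch with a hypothetical new_data array of the same shape:

# apply the already-fitted scalers to new, unseen data
for scaler, col in zip(data_scalers, cols):
    new_data[:, col] = np.ravel(scaler.transform(new_data[:, col].reshape(-1, 1)))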

7. Compute accuracy, precision, recall and F1 score for a given model

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def score_cal(model, test_X, test_y):
    num_validations = 5
    accuracy = cross_val_score(model, test_X, test_y, scoring='accuracy', cv=num_validations)
    print('Accuracy: {:.2f}%'.format(accuracy.mean() * 100))
    precision = cross_val_score(model, test_X, test_y, scoring='precision_weighted', cv=num_validations)
    print('Precision: {:.2f}%'.format(precision.mean() * 100))
    recall = cross_val_score(model, test_X, test_y, scoring='recall_weighted', cv=num_validations)
    print('Recall: {:.2f}%'.format(recall.mean() * 100))
    f1 = cross_val_score(model, test_X, test_y, scoring='f1_weighted', cv=num_validations)
    print('F1 score: {:.2f}%'.format(f1.mean() * 100))
    # print the performance report
    y_pred = model.predict(test_X)
    confusion_mat = confusion_matrix(test_y, y_pred)
    print(confusion_mat)  # see what the confusion matrix looks like
    # let sklearn print precision, recall and F1 directly
    target_names = ['<=50K', '>50K']
    print(classification_report(test_y, y_pred, target_names=target_names))
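A minimal call sketch, assuming a model already fitted on the training split (the estimator choice here is illustrative, not from the original post):

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=666)  # hypothetical model choice
model.fit(X_train, y_train)
score_cal(model, X_test, y_test)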

8. Randomly split training and test sets with sklearn

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
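For classification tasks with imbalanced labels, the split can additionally preserve the class proportions via the stratify parameter:

# keep the class distribution of y identical in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=666)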

9. Grid search in sklearn (GridSearchCV)

# grid search for kNN
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
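The parameter grid above can then be fed to GridSearchCV together with a kNN estimator; a sketch of the remaining steps, assuming X_train and y_train from section 8:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)  # the best parameter combination found
print(grid_search.best_score_)   # its cross-validated score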

10. Define a function that searches over all given parameters and evaluates the machine learning pipeline by its metrics

# import the grid-search module
from sklearn.model_selection import GridSearchCV

def get_best_model_accuracy(model, params, X, y):
    grid = GridSearchCV(model,  # the model to search over
                        params,  # the parameters to try
                        error_score=0.)  # score 0 if a fit raises an error
    grid.fit(X, y)  # fit the model and parameters
    # the classic performance metric
    print("Best Accuracy: {}".format(grid.best_score_))
    # the parameters that achieved the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # average fit time in seconds
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # average prediction time in seconds
    # this metric hints at the model's real-world performance
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

11. Traverse a DataFrame column and transform each row's value

def Ticket_First_Let(x):
    return x[0]
X_train['Ticket_First_Letter'] = X_train['Ticket'].apply(Ticket_First_Let)
X_test['Ticket_First_Letter'] = X_test['Ticket'].apply(Ticket_First_Let)
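The same first-letter extraction can be done with pandas' vectorized string accessor, which avoids the helper function:

X_train['Ticket_First_Letter'] = X_train['Ticket'].str[0]
X_test['Ticket_First_Letter'] = X_test['Ticket'].str[0]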
