有一些准则可以检测离群点,如:正态分布 3σ 准则、Z-score 异常值检测、基于 MAD 的 Z-score 异常值检测。
以上部分详情与代码请参考:https://blog.csdn.net/weixin_35757704/article/details/89280715
下面是其他的准则:高杠杆值点(帽子矩阵)、DFFITS值、SR学生化残差、cook距离和covratio值,先创建个例子:
import numpy as np
import statsmodels.api as sm
import pandas as pd
from sklearn.metrics import mean_squared_error
# Synthetic example: 100 observations of 4 integer regressors drawn from
# [0, 100), plus an integer response drawn the same way.
feature_names = ['col1', 'col2', 'col3', 'col4']
x = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=feature_names)
y = np.random.randint(0, 100, size=(100, 1))

# Ordinary least squares on the raw columns (no constant column is added).
ols_model = sm.OLS(y, x)
fit = ols_model.fit()
print(fit.summary())

# In-sample predictions and their root-mean-squared error.
pred = fit.predict()
mse = mean_squared_error(y, pred)
print("RMSE : ", np.sqrt(mse))

# Influence diagnostics object for the fitted model (outlier checks).
out_points = fit.get_influence()
高杠杆值点
# Leverage of each observation: the diagonal of the hat matrix.
leverage = out_points.hat_matrix_diag
# Rule of thumb: flag observations whose leverage exceeds 2 * k / n, where
# n is the number of rows and k the number of columns of x. The model above
# is fitted on x without an added constant, so k is just the column count
# (with an intercept the same rule would read 2 * (p + 1) / n).
n_obs, n_cols = x.shape
leverage_cutoff = 2 * n_cols / n_obs
leverage_out = x[leverage > leverage_cutoff]
DFFITS值
# DFFITS: statsmodels returns a pair (values, recommended threshold), where
# the threshold is 2 * sqrt(k / n) with k the number of fitted parameters
# and n the number of observations.
dffits, dffits_threshold = out_points.dffits
# DFFITS is signed, so compare its magnitude against the threshold;
# otherwise observations with a strongly negative DFFITS are never flagged.
# Using the library-supplied threshold keeps k in sync with the model that
# was actually fitted instead of hard-coding p + 1.
diffts_out = x[np.abs(dffits) > dffits_threshold]
SR学生化残差
# Externally studentized residuals.
studentized_residual = out_points.resid_studentized_external
# Ordinary observations are expected to satisfy |residual| <= 2; keep the
# rows that fall outside that band on either side.
beyond_band = (studentized_residual > 2) | (studentized_residual < -2)
studentized_residual_out = x[beyond_band]
cook距离
# Cook's distance: keep only the first element of the returned tuple,
# i.e. the per-observation distances.
cooks_result = out_points.cooks_distance
cook = cooks_result[0]
# The larger the value, the more likely the observation is an outlier.
covratio值
# COVRATIO value of each observation
covratio = out_points.cov_ratio
# The further a value is from 1, the more likely the observation is an outlier.
全部代码
import numpy as np
import statsmodels.api as sm
import pandas as pd
from sklearn.metrics import mean_squared_error
# Synthetic example: 100 observations of 4 integer regressors drawn from
# [0, 100), plus an integer response drawn the same way.
x = pd.DataFrame(
    np.random.randint(0, 100, size=(100, 4)),
    columns=['col1', 'col2', 'col3', 'col4'],
)
y = np.random.randint(0, 100, size=(100, 1))

# Ordinary least squares on the raw columns (no constant column is added).
fit = sm.OLS(y, x).fit()
print(fit.summary())
pred = fit.predict()
print("RMSE : ", np.sqrt(mean_squared_error(y, pred)))  # in-sample RMSE

# Influence / outlier diagnostics for the fitted model.
out_points = fit.get_influence()

# Leverage: diagonal of the hat matrix.
leverage = out_points.hat_matrix_diag
# Flag observations whose leverage exceeds 2 * k / n, where n is the number
# of rows and k the number of columns of x. No constant is added to the
# model, so k is the column count (with an intercept: 2 * (p + 1) / n).
leverage_out = x[leverage > 2 * (x.shape[1]) / x.shape[0]]

# DFFITS: statsmodels returns a pair (values, recommended threshold), where
# the threshold is 2 * sqrt(k / n) with k the number of fitted parameters.
dffits, dffits_threshold = out_points.dffits
# DFFITS is signed, so compare its magnitude against the threshold;
# otherwise strongly negative values are never flagged. The library
# threshold keeps k in sync with the model actually fitted.
diffts_out = x[np.abs(dffits) > dffits_threshold]

# Externally studentized residuals.
studentized_residual = out_points.resid_studentized_external
# Ordinary observations are expected to satisfy |residual| <= 2.
studentized_residual_out = x[np.abs(studentized_residual) > 2]

# Cook's distance (first element of the returned tuple).
cook = out_points.cooks_distance[0]
# The larger the value, the more likely the observation is an outlier.

# COVRATIO value of each observation.
covratio = out_points.cov_ratio
# The further a value is from 1, the more likely it is an outlier.

# Collect the diagnostics side by side, one named column per statistic.
# NOTE(review): the original comment says these are merged with the source
# data set, but x is not part of this concat; presumably that join happens
# in a later step. Confirm.
contat1 = pd.concat(
    [
        pd.Series(leverage, name='leverage'),
        pd.Series(dffits, name='dffits'),
        pd.Series(studentized_residual, name='rs'),
        pd.Series(cook, name='cook'),
        pd.Series(covratio, name='covratio'),
    ],
    axis=1,
)