1. Correlation Analysis
Analyzes the strength of the linear correlation between continuous variables.
Methods: initial impression from plots / Pearson correlation coefficient / Spearman rank correlation coefficient
# Initial impression from plots
# (1) Linear correlation between two variables
# Three samples of 50 uniform random numbers each:
#   data1 in [0, 100) sorted ascending,
#   data2 in [0, 50) sorted ascending,
#   data3 in [0, 500) sorted descending.
data1 = pd.Series(np.random.rand(50) * 100).sort_values()
data2 = pd.Series(np.random.rand(50) * 50).sort_values()
data3 = pd.Series(np.random.rand(50) * 500).sort_values(ascending=False)

fig = plt.figure(figsize=(10, 4))

# Left panel: both series ascend together -> positive linear correlation
ax1 = fig.add_subplot(1, 2, 1)
ax1.scatter(data1, data2)
plt.grid()

# Right panel: data3 descends while data1 ascends -> negative linear correlation
ax2 = fig.add_subplot(1, 2, 2)
ax2.scatter(data1, data3)
plt.grid()
# Initial impression from plots
# (2) Scatterplot matrix: a first look at the pairwise relationships
#     between several variables
# 200 rows x 4 columns of standard-normal noise scaled by 100.
data = pd.DataFrame(np.random.randn(200, 4) * 100, columns=['A', 'B', 'C', 'D'])

# scatter_matrix lives in pandas.plotting; the top-level pd.scatter_matrix
# alias is no longer available in current pandas releases.
pd.plotting.scatter_matrix(
    data,
    figsize=(8, 8),
    c='k',
    marker='+',
    diagonal='hist',  # histogram of each variable on the diagonal
    alpha=0.8,
    range_padding=0.1,
)
data.head()
2. Pearson correlation coefficient
# Pearson correlation coefficient - computed step by step
# Create the sample data: two independently drawn uniform samples, each
# sorted ascending so that they rise together.
data1 = pd.Series(np.random.rand(100) * 100).sort_values()
data2 = pd.Series(np.random.rand(100) * 50).sort_values()
data = pd.DataFrame({'value1': data1.values,
                     'value2': data2.values})
print(data.head())
print('------')

# Kolmogorov-Smirnov test of each column against a normal distribution
# parameterised with that column's own mean and standard deviation.
# A p-value > 0.05 means normality is not rejected.
u1, u2 = data['value1'].mean(), data['value2'].mean()  # means
std1, std2 = data['value1'].std(), data['value2'].std()  # standard deviations
print('value1 normality test:\n', stats.kstest(data['value1'], 'norm', (u1, std1)))
print('value2 normality test:\n', stats.kstest(data['value2'], 'norm', (u2, std2)))
print('------')

# Build the working table for the Pearson formula: the cross-product of the
# deviations and the squared deviations of each variable.
data['(x-u1)*(y-u2)'] = (data['value1'] - u1) * (data['value2'] - u2)
data['(x-u1)**2'] = (data['value1'] - u1) ** 2
data['(y-u2)**2'] = (data['value2'] - u2) ** 2
print(data.head())
print('------')

# r = sum((x-u1)(y-u2)) / sqrt(sum((x-u1)^2) * sum((y-u2)^2))
r = data['(x-u1)*(y-u2)'].sum() / np.sqrt(
    data['(x-u1)**2'].sum() * data['(y-u2)**2'].sum()
)
print('Pearson correlation coefficient: %.4f' % r)
# |r| > 0.8 -> highly linear correlation
# Pearson correlation coefficient - using the pandas built-in
# Create the sample data: two independently drawn uniform samples, each
# sorted ascending.
data1 = pd.Series(np.random.rand(100) * 100).sort_values()
data2 = pd.Series(np.random.rand(100) * 50).sort_values()
data = pd.DataFrame({'value1': data1.values,
                     'value2': data2.values})
print(data.head())
print('------')

# pandas correlation method: data.corr(method='pearson', min_periods=1)
# returns the correlation-coefficient matrix of the columns directly.
# method defaults to 'pearson'.
data.corr()
3. Spearman rank correlation coefficient
# Spearman rank correlation coefficient - computed step by step
# Create the sample data (columns: IQ, hours of TV watched per week).
data = pd.DataFrame({'智商':[106,86,100,101,99,103,97,113,112,110],
                     '每周看电视小时数':[7,0,27,50,28,29,20,12,6,17]})
print(data)
print('------')

# Sort by each column in turn (ascending) and record the 1-based position
# as that column's rank. The arange is assigned positionally, so each row
# keeps the rank it had at the moment of the corresponding sort.
data.sort_values('智商', inplace=True)
data['range1'] = np.arange(1,len(data)+1)
data.sort_values('每周看电视小时数', inplace=True)
data['range2'] = np.arange(1,len(data)+1)
print(data)
print('------')

# Rank difference d_i and its square d_i^2 for every row.
data['d'] = data['range1'] - data['range2']
data['d2'] = data['d']**2
print(data)
print('------')

# rs = 1 - 6 * sum(d_i^2) / (n * (n^2 - 1))
# This closed form assumes no tied values; neither column of the sample
# above contains duplicates.
n = len(data)
rs = 1 - 6 * (data['d2'].sum()) / (n * (n**2 - 1))
# The value computed here is the Spearman coefficient, so label it as such
# (the message previously said "Pearson").
print('Spearman相关系数为:%.4f' % rs)
# Spearman rank correlation coefficient - using the pandas built-in
# Sample data: one column per variable, ten observations each.
data = pd.DataFrame(
    {
        '智商': [106, 86, 100, 101, 99, 103, 97, 113, 112, 110],
        '每周看电视小时数': [7, 0, 27, 50, 28, 29, 20, 12, 6, 17],
    }
)
# Show the table followed by a separator line.
print(data, '------', sep='\n')

# DataFrame.corr returns the correlation-coefficient matrix of the columns;
# method='spearman' selects the rank correlation in place of the default
# 'pearson'.
data.corr(method='spearman')