Previous article: "Data Quality Inspection - Theory" mainly introduces the basic ideas and methods of data quality inspection. As a supplement, this article provides specific implementation methods from the perspective of Python actual combat.
Continuing the above, we will still describe from the four aspects of duplicate value checking, missing value checking, data skew problem, and outlier checking.
1. Environment introduction
Version: python2.7
Tools: Spyder
Developer: hbsygfz
2. Dataset introduction
Dataset: dataset.xlsx
3. Code implementation
3.1 Import related libraries
import pandas as pd
3.2 Read the dataset
dataset = pd.read_excel("/labcenter/python/dataset.xlsx")
discColList = ['col4','col7']
contColList = ['col1','col2','col3','col5','col6']
3.3 Duplicate value check
Main statistical indicators: the number of duplicate records, the number of unique field values.
### (1)重复记录数
def dupRowsCheck(df):
dupRows = df.duplicated().sum()
return dupRows
### (2)字段唯一值数
def uiqColValCheck(df):
# 记录数,变量数
m,n = df.shape
uiqDf = pd.DataFrame(index=df.columns,columns=['rows','uiqCnt'])
uiqDf['rows'] = m
for j in range(n):
ser = df.iloc[:,j]
name = df.columns[j]
uiqCnt = len(ser.unique())
uiqDf.loc[name,'uiqCnt'] = uiqCnt
return uiqDf
Execution and result:
dupRowsCheck(dataset)
Out[95]: 0
uiqColValCheck(dataset)
Out[96]:
rows uiqCnt
col1 10 10
col2 10 9
col3 10 10
col4 10 3
col5 10 9
col6 10 5
col7 10 2
3.4 Missing value checking
Main statistical indicators: the number of records with null values in the field.
def missingCheck(df):
# 记录数,变量数
m,n = df.shape
rowsSer = pd.Series(index=df.columns)
rowsSer.name = 'rows'
# 空值记录数
nullCntSer = df.isnull().sum()
nullCntSer.name = 'nullCnt'
# 合并结果
missDf = pd.concat([rowsSer,nullCntSer],axis=1)
missDf['rows'] = m
return missDf
Execution and result:
missingCheck(dataset)
Out[97]:
rows nullCnt
col1 10 0
col2 10 1
col3 10 0
col4 10 0
col5 10 1
col6 10 0
col7 10 0
3.5 Data skew problem
Main statistical indicators: the number of records, the number of categories, the maximum number of category records, and the proportion of the maximum category records.
def skewCheck(df,discList,contList,bins):
# 离散型变量类别统计
new_df1 = df[discList]
skewDf1 = pd.DataFrame(index=discList,columns=['rows','classCnt','mostClassCnt','mostClassRio'])
m1,n1 = new_df1.shape
for j in range(n1):
ser = new_df1.iloc[:,j]
name = new_df1.columns[j]
freqSer = pd.value_counts(ser)
skewDf1.loc[name,'rows'] = m1
skewDf1.loc[name,'classCnt'] = len(freqSer)
skewDf1.loc[name,'mostClassCnt'] = freqSer[0]
skewDf1.loc[name,'mostClassRio'] = freqSer[0] * 1.00 / m1
# 连续型变量分箱统计
new_df2 = df[contList]
skewDf2 = pd.DataFrame(index=contList,columns=['rows','classCnt','mostClassCnt','mostClassRio'])
m2,n2 = new_df2.shape
for j in range(n2):
ser = new_df2.iloc[:,j]
name = new_df2.columns[j]
freqSer = pd.value_counts(pd.cut(ser,bins))
skewDf2.loc[name,'rows'] = m2
skewDf2.loc[name,'classCnt'] = len(freqSer)
skewDf2.loc[name,'mostClassCnt'] = freqSer[0]
skewDf2.loc[name,'mostClassRio'] = freqSer[0] * 1.00 / m2
# 合并结果
skewDf = pd.concat([skewDf1,skewDf2],axis=0)
return skewDf
Execution and result:
skewCheck(dataset,discColList,contColList,4)
Out[98]:
rows classCnt mostClassCnt mostClassRio
col4 10 3 5 0.5
col7 10 2 6 0.6
col1 10 4 3 0.3
col2 10 4 3 0.3
col3 10 4 4 0.4
col5 10 4 3 0.3
col6 10 4 1 0.1
3.6 Outlier Checking
Main statistical indicators: maximum, minimum, mean, standard deviation, coefficient of variation, the number of records greater than the mean + 3 times the standard deviation, the number of records less than the mean - 3 times the standard deviation, greater than the upper quartile + 1.5 times The number of records in the interquartile range, the number of records in the interquartile range less than -1.5 times the lower quartile, the number of positive records, the number of zero records, and the number of negative records.
### (1)异常值统计
def outCheck(df,contList):
new_df = df[contList]
resDf = new_df.describe()
resDf.loc['cov'] = resDf.loc['std'] / resDf.loc['mean'] #计算变异系数
resDf.loc['mean+3std'] = resDf.loc['mean'] + 3 * resDf.loc['std'] #计算平均值+3倍标准差
resDf.loc['mean-3std'] = resDf.loc['mean'] - 3 * resDf.loc['std'] #计算平均值-3倍标准差
resDf.loc['75%+1.5dist'] = resDf.loc['75%'] + 1.5 * (resDf.loc['75%'] - resDf.loc['25%']) #计算上四分位+1.5倍的四分位间距
resDf.loc['25%-1.5dist'] = resDf.loc['25%'] - 1.5 * (resDf.loc['75%'] - resDf.loc['25%']) #计算下四分位-1.5倍的四分位间距
# 3segma检查
segmaSer1 = new_df[new_df > resDf.loc['mean+3std']].count() #平均值+3倍标准差
segmaSer1.name = 'above3SegmaCnt'
segmaSer2 = new_df[new_df < resDf.loc['mean-3std']].count() #平均值-3倍标准差
segmaSer2.name = 'below3SegmaCnt'
# 箱线图检查
boxSer1 = new_df[new_df > resDf.loc['75%+1.5dist']].count() #上四分位+1.5倍的四分位间距
boxSer1.name = 'aboveBoxCnt'
boxSer2 = new_df[new_df < resDf.loc['25%-1.5dist']].count() #下四分位-1.5倍的四分位间距
boxSer2.name = 'belowBoxCnt'
# 合并结果
outTmpDf1 = pd.concat([segmaSer1,segmaSer2,boxSer1,boxSer2],axis=1)
outTmpDf2 = resDf.loc[['max','min','mean','std','cov']]
outDf = pd.concat([outTmpDf2.T,outTmpDf1],axis=1)
return outDf
### (2)正负分布检查
def distCheck(df,contList):
new_df = df[contList]
distDf = pd.DataFrame(index=contList,columns=['rows','posCnt','zeroCnt','negCnt'])
m,n = new_df.shape
for j in range(n):
ser = new_df.iloc[:,j]
name = new_df.columns[j]
posCnt = ser[ser>0].count()
zeroCnt = ser[ser==0].count()
negCnt = ser[ser<0].count()
distDf.loc[name,'rows'] = m
distDf.loc[name,'posCnt'] = posCnt
distDf.loc[name,'zeroCnt'] = zeroCnt
distDf.loc[name,'negCnt'] = negCnt
return distDf
Execution and result:
outCheck(dataset,contColList)
Out[101]:
max min mean std cov above3SegmaCnt below3SegmaCnt aboveBoxCnt belowBoxCnt
col1 110.0000 101.0 105.500000 3.027650 0.028698 0 0 0 0
col2 58.0000 20.0 34.444444 11.959422 0.347209 0 0 1 0
col3 221.0000 10.0 87.700000 71.030588 0.809927 0 0 0 0
col5 598.0000 0.0 246.333333 235.303647 0.955225 0 0 0 0
col6 0.0115 -0.3 -0.027740 0.095759 -3.452026 0 0 2 1
distCheck(dataset,contColList)
Out[102]:
rows posCnt zeroCnt negCnt
col1 10 10 0 0
col2 10 9 0 0
col3 10 10 0 0
col5 10 7 2 0
col6 10 3 6 1