Data Preparation (2): Data Quality Checking in Practice

The previous article, "Data Quality Checking - Theory", introduced the basic ideas and methods of data quality checking. As a companion to it, this article gives concrete implementations from a hands-on Python perspective.
Following the same structure, we again cover four aspects: duplicate value checking, missing value checking, data skew, and outlier checking.

1. Environment introduction

Version: Python 2.7
Tool: Spyder
Author: hbsygfz

2. Dataset introduction

Dataset: dataset.xlsx, a small sample of 10 records with 7 fields (col1 to col7). col4 and col7 are discrete; col1, col2, col3, col5, and col6 are continuous.


3. Code implementation

3.1 Import the required libraries

import pandas as pd


3.2 Read the dataset

dataset = pd.read_excel("/labcenter/python/dataset.xlsx")
discColList = ['col4','col7']                           # discrete (categorical) columns
contColList = ['col1','col2','col3','col5','col6']      # continuous (numeric) columns

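If you do not have dataset.xlsx at hand, a synthetic stand-in with the same shape lets you run every check below. The values here are invented for illustration and will not reproduce the printed outputs exactly; only the structure (10 rows, columns col1 to col7, one null each in col2 and col5) mirrors the real file:

import numpy as np

# Hypothetical stand-in for dataset.xlsx; values are made up
dataset = pd.DataFrame({
    'col1': [101,102,103,104,105,106,107,108,109,110],       # continuous
    'col2': [20,24,28,30,30,36,40,44,np.nan,58],             # continuous, 1 null
    'col3': [10,20,30,45,60,80,110,140,161,221],             # continuous
    'col4': ['a','a','a','a','a','b','b','b','c','c'],       # discrete, 3 classes
    'col5': [0,0,60,120,199,np.nan,300,440,500,598],         # continuous, 1 null
    'col6': [-0.3,0,0,0,0,0,0,0.005,0.01,0.0115],            # continuous
    'col7': ['x','x','x','x','x','x','y','y','y','y'],       # discrete, 2 classes
})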

3.3 Duplicate value check

Key statistics: the number of duplicate records and the number of unique values per field.

### (1) Number of duplicate records
def dupRowsCheck(df):
    # duplicated() flags repeated rows; summing the flags counts them
    dupRows = df.duplicated().sum()
    return dupRows

### (2) Number of unique values per field
def uiqColValCheck(df):
    # number of records, number of variables
    m,n = df.shape
    uiqDf = pd.DataFrame(index=df.columns,columns=['rows','uiqCnt'])
    uiqDf['rows'] = m
    for j in range(n):
        ser = df.iloc[:,j]
        name = df.columns[j]
        # note: unique() counts NaN as one distinct value
        uiqCnt = len(ser.unique())
        uiqDf.loc[name,'uiqCnt'] = uiqCnt
    return uiqDf

Execution and result:

dupRowsCheck(dataset)
Out[95]: 0
uiqColValCheck(dataset)
Out[96]: 
      rows uiqCnt
col1    10     10
col2    10      9
col3    10     10
col4    10      3
col5    10      9
col6    10      5
col7    10      2

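As an aside, pandas can produce the per-column unique counts in a single call. A minimal equivalent of uiqColValCheck, assuming a pandas version that supports DataFrame.nunique (0.20 or later), with dropna=False so that NaN counts as a distinct value just like len(ser.unique()) does:

# One-call equivalent of the loop above: unique values per column,
# counting NaN as a distinct value (dropna=False)
uiqSer = dataset.nunique(dropna=False)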

3.4 Missing value check

Key statistics: the number of null records per field.

def missingCheck(df):
    # number of records, number of variables
    m,n = df.shape
    rowsSer = pd.Series(index=df.columns)
    rowsSer.name = 'rows'
    # null count per field
    nullCntSer = df.isnull().sum()
    nullCntSer.name = 'nullCnt'
    # combine the results
    missDf = pd.concat([rowsSer,nullCntSer],axis=1)
    missDf['rows'] = m
    return missDf

Execution and result:

missingCheck(dataset)
Out[97]: 
      rows  nullCnt
col1    10        0
col2    10        1
col3    10        0
col4    10        0
col5    10        1
col6    10        0
col7    10        0

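When comparing fields, the missing rate is often more telling than the raw count. It is one line in standard pandas (an addition here, not part of the original script):

# Missing rate per column: the mean of the boolean null mask
missRateSer = dataset.isnull().mean()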

3.5 Data skew check

Key statistics: the number of records, the number of classes, the size of the largest class, and the proportion of records falling in the largest class.

def skewCheck(df,discList,contList,bins):
    # class frequencies for the discrete variables
    new_df1 = df[discList]
    skewDf1 = pd.DataFrame(index=discList,columns=['rows','classCnt','mostClassCnt','mostClassRio'])
    m1,n1 = new_df1.shape
    for j in range(n1):
        ser = new_df1.iloc[:,j]
        name = new_df1.columns[j]
        freqSer = pd.value_counts(ser)    # sorted by frequency, descending
        skewDf1.loc[name,'rows'] = m1
        skewDf1.loc[name,'classCnt'] = len(freqSer)
        skewDf1.loc[name,'mostClassCnt'] = freqSer.iloc[0]    # iloc: positional, avoids label lookup
        skewDf1.loc[name,'mostClassRio'] = freqSer.iloc[0] * 1.00 / m1
    # binned frequencies for the continuous variables
    new_df2 = df[contList]
    skewDf2 = pd.DataFrame(index=contList,columns=['rows','classCnt','mostClassCnt','mostClassRio'])
    m2,n2 = new_df2.shape
    for j in range(n2):
        ser = new_df2.iloc[:,j]
        name = new_df2.columns[j]
        freqSer = pd.value_counts(pd.cut(ser,bins))    # bin first, then count
        skewDf2.loc[name,'rows'] = m2
        skewDf2.loc[name,'classCnt'] = len(freqSer)
        skewDf2.loc[name,'mostClassCnt'] = freqSer.iloc[0]
        skewDf2.loc[name,'mostClassRio'] = freqSer.iloc[0] * 1.00 / m2
    # combine the two result frames
    skewDf = pd.concat([skewDf1,skewDf2],axis=0)
    return skewDf

Execution and result:

skewCheck(dataset,discColList,contColList,4)
Out[98]: 
     rows classCnt mostClassCnt mostClassRio
col4   10        3            5          0.5
col7   10        2            6          0.6
col1   10        4            3          0.3
col2   10        4            3          0.3
col3   10        4            4          0.4
col5   10        4            3          0.3
col6   10        4            1          0.1

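The largest-class share computed above can be cross-checked on a single column with value_counts(normalize=True), which is standard pandas but not part of the original script:

# Share of the most frequent class in col4; should match mostClassRio above
topShare = dataset['col4'].value_counts(normalize=True).iloc[0]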

3.6 Outlier check

Key statistics: maximum, minimum, mean, standard deviation, coefficient of variation; the number of records above mean + 3 standard deviations and below mean - 3 standard deviations; the number of records above Q3 + 1.5 IQR and below Q1 - 1.5 IQR; and the counts of positive, zero, and negative records.

### (1) Outlier statistics
def outCheck(df,contList):
    new_df = df[contList]
    resDf = new_df.describe()
    resDf.loc['cov'] = resDf.loc['std'] / resDf.loc['mean']     # coefficient of variation
    resDf.loc['mean+3std'] = resDf.loc['mean'] + 3 * resDf.loc['std']  # mean + 3 std
    resDf.loc['mean-3std'] = resDf.loc['mean'] - 3 * resDf.loc['std']  # mean - 3 std
    resDf.loc['75%+1.5dist'] = resDf.loc['75%'] + 1.5 * (resDf.loc['75%'] - resDf.loc['25%'])  # Q3 + 1.5 IQR
    resDf.loc['25%-1.5dist'] = resDf.loc['25%'] - 1.5 * (resDf.loc['75%'] - resDf.loc['25%'])  # Q1 - 1.5 IQR
    # 3-sigma rule
    segmaSer1 = new_df[new_df > resDf.loc['mean+3std']].count()    # above mean + 3 std
    segmaSer1.name = 'above3SegmaCnt'
    segmaSer2 = new_df[new_df < resDf.loc['mean-3std']].count()    # below mean - 3 std
    segmaSer2.name = 'below3SegmaCnt'
    # box-plot rule
    boxSer1 = new_df[new_df > resDf.loc['75%+1.5dist']].count()    # above Q3 + 1.5 IQR
    boxSer1.name = 'aboveBoxCnt'
    boxSer2 = new_df[new_df < resDf.loc['25%-1.5dist']].count()    # below Q1 - 1.5 IQR
    boxSer2.name = 'belowBoxCnt'
    # combine the results
    outTmpDf1 = pd.concat([segmaSer1,segmaSer2,boxSer1,boxSer2],axis=1)
    outTmpDf2 = resDf.loc[['max','min','mean','std','cov']]
    outDf = pd.concat([outTmpDf2.T,outTmpDf1],axis=1)
    return outDf
    
### (2) Sign distribution check
def distCheck(df,contList):
    new_df = df[contList]
    distDf = pd.DataFrame(index=contList,columns=['rows','posCnt','zeroCnt','negCnt'])
    m,n = new_df.shape
    for j in range(n):
        ser = new_df.iloc[:,j]
        name = new_df.columns[j]
        posCnt = ser[ser>0].count()      # positive records
        zeroCnt = ser[ser==0].count()    # zero records
        negCnt = ser[ser<0].count()      # negative records
        distDf.loc[name,'rows'] = m
        distDf.loc[name,'posCnt'] = posCnt
        distDf.loc[name,'zeroCnt'] = zeroCnt
        distDf.loc[name,'negCnt'] = negCnt
    return distDf

Execution and result:

outCheck(dataset,contColList)
Out[101]: 
           max    min        mean         std       cov  above3SegmaCnt  below3SegmaCnt  aboveBoxCnt  belowBoxCnt
col1  110.0000  101.0  105.500000    3.027650  0.028698               0               0            0            0
col2   58.0000   20.0   34.444444   11.959422  0.347209               0               0            1            0
col3  221.0000   10.0   87.700000   71.030588  0.809927               0               0            0            0
col5  598.0000    0.0  246.333333  235.303647  0.955225               0               0            0            0
col6    0.0115   -0.3   -0.027740    0.095759 -3.452026               0               0            2            1
distCheck(dataset,contColList)
Out[102]: 
     rows posCnt zeroCnt negCnt
col1   10     10       0      0
col2   10      9       0      0
col3   10     10       0      0
col5   10      7       2      0
col6   10      3       6      1
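Counting outliers tells you which fields deserve attention; to inspect the offending records themselves, the same box-plot fences can be turned into a boolean mask. A minimal sketch (not part of the original script; col6 is chosen only because the output above flags it):

# Rows where col6 falls outside the box-plot fences Q1 - 1.5*IQR .. Q3 + 1.5*IQR
ser = dataset['col6']
q1, q3 = ser.quantile(0.25), ser.quantile(0.75)
iqr = q3 - q1
outRows = dataset[(ser < q1 - 1.5 * iqr) | (ser > q3 + 1.5 * iqr)]
print(outRows)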





