Data pre-cleaning

'' ' 
    Cleaning data 
            1. The process is repeated with the detection value 
            2 detecting missing values are 
            3 detects the abnormal value processing 
' '' 
Import numpy AS NP
 Import PANDAS PD AS 

# deduplication: A column according to weight, while retaining and the first data in the original table to re 
data = pd.DataFrame ({ ' a ' : [. 1,. 1, 2, 2], ' B ' : [ ' a ' , ' B ' , ' a ' , ' B ' ]})
 Print (Data)
 # of columns A deduplication 
# data.drop_duplicates ( 'A', 'first', inplace=True)
# Print (Data) 
# of all columns to re 
# data.drop_duplicates ( 'First', InPlace = True) 
# Print (Data) 
# detect duplicate values: default in rows 
Duplicates = data.duplicated ()
 Print (Duplicates)
 # Detection duplicate values: in columns 
duplicates data.duplicated = ([ ' A ' ])
 Print (duplicates) 

# handling missing values 
# delete method 
a dates = pd.date_range ( ' 20,130,101 ' , = periods. 6 ) 
DF = pd.DataFrame (NP. aRange (24) .reshape ((. 6,. 4)), index = a dates, Columns = [ ' A ' ,' B ' , ' C ' , ' D ' ]) 
df.iloc [0, . 1] = np.nan 
df.iloc [ . 1, 2] = np.nan
 Print (DF) 
DF1 = df.dropna (Axis = 0 , How = ' the any ' , InPlace = False)
 Print (DF1)
 # alternative method 
DF2 = df.fillna (value = 0)
 Print (DF2)
 # determines whether a missing value 
DF3 = df.isnull ()
 Print (DF3)
 # statistics appeared in a total number of missing values 
Print (df3.sum ())   #The number of missing values in each column (in the form of default) 
Print (df3.sum (Axis =. 1))   # each row number of missing values 
Print (df3.sum (). SUM ())   # data overall number of missing values appear 
# Analyzing if there are missing data in the entire data 
DF4 = np.any (df.isnull ()) == True
 Print (DF4) 


Output:
   A  B
0  1  a
1  1  b
2  2  a
3  2  b
0    False
1    False
2    False
3    False
dtype: bool
0    False
1     True
2    False
3     True
dtype: bool
             A     B     C   D
2013-01-01   0   NaN   2.0   3
2013-01-02   4   5.0   NaN   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
             A     B     C   D
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
             A     B     C   D
2013-01-01   0   0.0   2.0   3
2013-01-02   4   5.0   0.0   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
                A      B      C      D
2013-01-01  False   True  False  False
2013-01-02  False  False   True  False
2013-01-03  False  False  False  False
2013-01-04  False  False  False  False
2013-01-05  False  False  False  False
2013-01-06  False  False  False  False
A    0
B    1
C    1
D    0
dtype: int64
2013-01-01    1
2013-01-02    1
2013-01-03    0
2013-01-04    0
2013-01-05    0
2013-01-06    0
Freq: D, dtype: int64
2
True
"""
Data cleaning: interpolation of missing values.
"""
import numpy as np
from scipy.interpolate import interp1d
import matplotlib.pyplot as mp

# Create the data to be interpolated.
x = np.linspace(0, 10 * np.pi, 20)
y = np.cos(x)
# Build a linear and a quadratic interpolator over the samples.
f1 = interp1d(x, y, kind='linear')
fq = interp1d(x, y, kind='quadratic')
# Keep the interpolation grid within [x.min(), x.max()] so we never ask the
# interpolators to extrapolate (interp1d raises outside the sample range).
xint = np.linspace(x.min(), x.max(), 1000)
yint1 = f1(xint)
yintq = fq(xint)
# Compare the linear and quadratic interpolation results visually.
mp.plot(x, y, color='pink')
mp.plot(xint, yint1, color='blue', label='linear')
mp.plot(xint, yintq, color='orangered', label='quadratic')
mp.legend()
mp.show()

  

Source: https://www.cnblogs.com/yuxiangyang/p/11286536.html