"""Cleaning data.

1. Remove duplicate values.
2. Detect missing values.
3. Detect and process abnormal values.
"""
import numpy as np
import pandas as pd

# Deduplication: drop rows duplicated on a column while keeping the first
# occurrence seen in the original table.
data = pd.DataFrame({'A': [1, 1, 2, 2], 'B': ['a', 'b', 'a', 'b']})
print(data)

# Deduplicate on column A only (uncomment to apply in place):
# data.drop_duplicates('A', keep='first', inplace=True)
# print(data)

# Deduplicate on all columns (uncomment to apply in place):
# data.drop_duplicates(keep='first', inplace=True)
# print(data)

# Detect duplicate values: default compares whole rows.
duplicates = data.duplicated()
print(duplicates)

# Detect duplicate values restricted to column A.
duplicates = data.duplicated(['A'])
print(duplicates)

# --- Handling missing values ---

# Deletion method: build a frame with two NaN cells.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)),
                  index=dates, columns=['A', 'B', 'C', 'D'])
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# Drop any row (axis=0) containing at least one NaN.
df1 = df.dropna(axis=0, how='any', inplace=False)
print(df1)

# Replacement method: fill NaN cells with a constant.
df2 = df.fillna(value=0)
print(df2)

# Boolean mask: True where a value is missing.
df3 = df.isnull()
print(df3)

# Number of missing values per column (the default axis).
print(df3.sum())
# Number of missing values per row.
print(df3.sum(axis=1))
# Total number of missing values in the whole frame.
print(df3.sum().sum())

# Whether the data contains any missing value at all.
df4 = np.any(df.isnull()) == True
print(df4)

# Expected console output (abridged from the original transcript):
#   duplicated() by row   -> all False (no fully duplicated rows)
#   duplicated(['A'])     -> [False, True, False, True]
#   df3.sum()             -> A:0  B:1  C:1  D:0
#   df3.sum(axis=1)       -> 1 for 2013-01-01 and 2013-01-02, else 0
#   df3.sum().sum()       -> 2
#   np.any(df.isnull())   -> True
"""Data cleaning: interpolation of missing values.

Compares linear and quadratic interpolation of a cosine sample and plots
both against the original points.
"""
import numpy as np
from scipy.interpolate import interp1d
import matplotlib.pyplot as mp

# Create the data to be interpolated: 20 samples of cos(x) on [0, 10*pi].
x = np.linspace(0, 10 * np.pi, 20)
y = np.cos(x)

# Build linear and quadratic interpolators over the samples.
f1 = interp1d(x, y, kind='linear')
fq = interp1d(x, y, kind='quadratic')

# Clamp the query grid to [x.min(), x.max()] so interp1d never receives
# out-of-range values (which would raise by default).
xint = np.linspace(x.min(), x.max(), 1000)
yint1 = f1(xint)
yintq = fq(xint)

# Compare linear vs quadratic interpolation visually.
mp.plot(x, y, color='pink')
mp.plot(xint, yint1, color='blue', label='linear')
mp.plot(xint, yintq, color='orangered', label='quadratic')
mp.legend()
mp.show()