"""Clean the 2018 hospital sales workbook.

Steps: load the sheet, rename the sale-date column, drop rows with missing
key fields, convert column types, reduce the sale-time text to a date, sort
by date, rebuild the row index and remove rows with non-positive sales.
"""
import pandas as pd

# Column labels, spelled as they appear in this script.
# NOTE(review): the script referred to the sale-date column under several
# spellings ('time sales', 'selling time', ' selling time '); they are unified
# here on the rename target. Confirm every label against the workbook headers.
RAW_TIME_COL = 'consumers in time'
TIME_COL = 'time sales'
SSN_COL = 'social security number'
SALES_COL = 'sales'
RECEIVABLE_COL = 'amount receivable'


def splitsaletime(timeColser):
    """Return the date part of each sale-time string.

    Every value is split on a single space and only the first piece is kept,
    e.g. ``'2018-01-01 Friday'`` becomes ``'2018-01-01'``.

    Args:
        timeColser: Series (or any iterable) of sale-time strings.

    Returns:
        A ``pd.Series`` of date strings. When ``timeColser`` is a Series its
        index is reused, so the result assigns back onto the source frame row
        for row even after rows have been dropped; otherwise the default
        0..n-1 index is used.

    Raises:
        AttributeError: if a value has no ``split`` method (e.g. NaN).
    """
    timelist = [value.split(' ')[0] for value in timeColser]
    # Keep the caller's row labels: pandas aligns on index during assignment,
    # and a fresh 0..n-1 index would not match a frame whose index has gaps.
    index = timeColser.index if isinstance(timeColser, pd.Series) else None
    return pd.Series(timelist, index=index)


if __name__ == '__main__':
    # NOTE(review): path reproduced exactly as found; it looks garbled, so
    # confirm the real location of the workbook before running.
    df = pd.read_excel('E: // // Chaoyang Hospital, the Data 2018 sales data .xlsx')

    # The bare expressions in this script only display a value when run
    # interactively (e.g. as the last statement of a notebook cell).
    df.head()    # first five rows
    df.shape     # (row count, column count)
    df.dtypes    # data type of each column

    # Rename the sale-date column. inplace=True modifies df directly instead
    # of returning a new object.
    colNameDict = {RAW_TIME_COL: TIME_COL}
    df.rename(columns=colNameDict, inplace=True)
    df.head()

    # Missing values: how='any' drops a row when either listed column is null.
    print('delete size before missing values', df.shape)
    df = df.dropna(subset=[TIME_COL, SSN_COL], how='any')
    print('missing values after deleting size', df.shape)

    # Data type conversion.
    df[SALES_COL] = df[SALES_COL].astype('float')
    df[RECEIVABLE_COL] = df[RECEIVABLE_COL].astype('float')
    df[SSN_COL] = df[SSN_COL].astype('object')

    # Reduce the sale-time text to its date part.
    timeSer = df.loc[:, TIME_COL]
    dateSer = splitsaletime(timeSer)
    df[TIME_COL] = dateSer
    df.head()

    # Convert the date strings to datetimes; unparseable values become NaT.
    # Plain column assignment replaces the column, so it takes the new dtype.
    df[TIME_COL] = pd.to_datetime(df[TIME_COL], format='%Y-%m-%d',
                                  errors='coerce')
    df.dtypes
    # Drop the rows whose date could not be parsed.
    df = df.dropna(subset=[TIME_COL, SSN_COL], how='any')

    # Sort by sale date; ascending=True is oldest first, False is newest first.
    print('data before sorting')
    df.head()
    df = df.sort_values(by=TIME_COL, ascending=True)
    print('sorted data')
    df.head()

    # Sorting leaves the row labels out of order; rebuild a 0..n-1 index.
    df = df.reset_index(drop=True)
    df.head()

    # Outliers: describe() gives per-column summary statistics. The original
    # notes say the sales column may hold negative values from recording
    # errors, so only rows with sales > 0 are kept.
    df.describe()
    queryser = df.loc[:, SALES_COL] > 0
    print('before outliers deleted', df.shape)
    df = df.loc[queryser, :]
    print('after remove outliers', df.shape)