Python learning notes: a data-cleaning walkthrough on a simple dataset

import pandas as pd

# Load the sales workbook into a DataFrame.
# NOTE(review): this path looks garbled (stray spaces and slashes) -- confirm
# the real location of the workbook before running.
df = pd.read_excel('E: // // Chaoyang Hospital, the Data 2018 sales data .xlsx')
df.head()   # preview the first five rows
df.shape    # (number of rows, number of columns)
df.dtypes   # data type of each column

# Rename the 'consumers in time' column to 'time sales'.
colNameDict = {'consumers in time': 'time sales'}
# inplace=True modifies df directly; inplace=False would instead return a new,
# renamed DataFrame and leave df untouched.
df.rename(columns=colNameDict, inplace=True)
df.head()


# Missing-value handling
print('delete size before missing values', df.shape)
# Drop every row whose sales-time or social-security-number cell is empty;
# how='any' drops a row as soon as any one of the listed columns is null.
# The sales-time column is named 'time sales' by the rename earlier in the script.
df = df.dropna(subset=['time sales', 'social security number'], how='any')
print('missing values after deleting size', df.shape)

# Data type conversion
df['sales'] = df['sales'].astype('float')
df['amount receivable'] = df['amount receivable'].astype('float')
df['social security number'] = df['social security number'].astype('object')
## Modified Date
def splitsaletime(timeColser):
    """Strip the weekday from each sales-time string, keeping only the date.

    Each value is expected to look like ``'2018-01-01 Friday'``; the text
    before the first space (``'2018-01-01'``) is kept.

    Args:
        timeColser: The sales-time column -- a pandas Series (or any other
            iterable) of strings.

    Returns:
        A pandas Series of date strings. When ``timeColser`` is a Series its
        index is reused, so assigning the result back into the DataFrame
        lines up row by row even after rows have been dropped; for any other
        iterable the result gets a default 0..n-1 index.
    """
    timelist = [value.split(' ')[0] for value in timeColser]
    # Only a Series carries row labels worth preserving (note that a plain
    # list also has an ``index`` attribute, but it is a method, not labels).
    index = timeColser.index if isinstance(timeColser, pd.Series) else None
    return pd.Series(timelist, index=index)

## Input: timeColser -- the sales-time column, a pandas Series
## Output: the date part of each value, returned as a Series

timeSer = df.loc[:, 'time sales']   # the sales-time column
dateSer = splitsaletime(timeSer)    # keep only the date part of each string

df.loc[:, 'time sales'] = dateSer   # overwrite the column with the date strings
df.head()

# Convert the date strings to datetime values.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising. The column is replaced outright (rather than written
# through .loc[:, ...]) so that it takes on the datetime64 dtype.
df['time sales'] = pd.to_datetime(df['time sales'], format='%Y-%m-%d', errors='coerce')
df.dtypes

# Coercion can introduce new nulls (NaT), so drop incomplete rows again.
df = df.dropna(subset=['time sales', 'social security number'], how='any')


## Data ordering
# Sort with DataFrame.sort_values: `by` names the column to sort on;
# ascending=True sorts in ascending order, ascending=False in descending order.
print('data before sorting')
df.head()
df = df.sort_values(by='time sales', ascending=True)
print('sorted data')
df.head()

## After sorting, the row labels are out of order, so rebuild the index.
## drop=True discards the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)
df.head()


## Handle outliers: first use describe() to view summary statistics for
## every numeric column of the DataFrame.
# Per the original author's note, the output shows that 'sales' can be
# negative, which points to recording errors that should be removed.
df.describe()

# Delete rows by applying a boolean condition.
queryser = df.loc[:, 'sales'] > 0   # True for rows with a positive sales value
print('before outliers deleted', df.shape)
df = df.loc[queryser, :]            # keep only the rows that satisfy the condition
print('after remove outliers', df.shape)

  

Guess you like

Origin www.cnblogs.com/manjianlei/p/11299964.html