Data Analysis & Data Mining: A Supermarket-Chain Data Analysis Case Study

import pandas as pd

# Load the order data.
# NOTE(review): "ansi" is a Windows-only codec alias (locale code page);
# on other platforms use the file's real encoding, e.g. "gbk" -- TODO confirm.
data = pd.read_csv("./order.csv", encoding="ansi")
# print("data:\n", data)
print("column index of data:\n", data.columns)

# The raw data contains rows with sales == 0 or sales < 0.
# Drop the bad rows: keep only rows where sales > 0.
bool_index = data.loc[:, "sales"] > 0
data = data.loc[bool_index, :]

# NOTE(review): the column names below ("category ID", "sales", ...) are
# translations; they must match the actual CSV header -- TODO confirm.

# 1. Which categories of goods are more popular?
# Group by category ID and sum the sales.
# sort_values sorts ascending by default; pass ascending=False for descending.
res = (
    data.groupby(by="category ID")["sales"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
# Equivalent result via a pivot table.
res = pd.pivot_table(
    data=data,
    index="category ID",
    values="sales",
    aggfunc="sum",
).sort_values(by="sales", ascending=False).head(10)
print("res:\n", res)

# 2. Which individual goods are more popular?
res = (
    data.groupby(by="product ID")["sales"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
res = pd.pivot_table(
    data=data,
    index="product ID",
    values="sales",
    aggfunc="sum",
).sort_values(by="sales", ascending=False).head(10)
print("res:\n", res)

# 3. Sales share of each store.
# (1) Revenue per row = unit price * sales quantity.
data.loc[:, "sales per item"] = data.loc[:, "unit price"] * data.loc[:, "sales"]

# (2) Group by shop ID and sum the revenue per store.
all_ = data.groupby(by="shop ID")["sales per item"].sum()
print(all_)

# (3) Each store's share of total revenue, formatted as a percentage.
print(
    "share of each store:",
    (all_ / all_.sum()).apply(lambda x: format(x, ".2%")),
)

# 4. What is the peak customer-traffic period of the supermarket?
# (1) One order ID covers multiple goods but represents one customer visit,
# so deduplicate on order ID first.
# subset: column(s) used to detect duplicates; inplace=True modifies data.
data.drop_duplicates(subset="order ID", inplace=True)
print("data after deduplication:\n", data)

# (2) Extract the hour attribute from the transaction time.
# BUGFIX: the original converted into a *new* column ("transaction time")
# but then read the unconverted one ("turnover time") -- both refer to the
# same source column; use one consistent name so .hour actually works.
data.loc[:, "transaction time"] = pd.to_datetime(data.loc[:, "transaction time"])
# .dt.hour is the vectorized idiom for per-row hour extraction.
data.loc[:, "hour"] = data.loc[:, "transaction time"].dt.hour
print(data)

# (3) Group by hour and count orders per hour, busiest hours first.
res = data.groupby(by="hour")["order ID"].count().sort_values(ascending=False)
print("res:\n", res)

# sort_index() would instead order the result by the hour (row index).

You may also like

Origin www.cnblogs.com/Tree0108/p/12116085.html