import pandas as pd


def filter_valid_sales(data):
    """Drop erroneous rows whose sales are zero or negative; return a copy.

    The raw order data contains records with sales <= 0; only rows with
    sales > 0 are kept for analysis.
    """
    mask = data.loc[:, "sales"] > 0
    return data.loc[mask, :].copy()


def top_categories(data, n=10):
    """Total sales per "category ID", sorted descending, top *n* rows.

    Answers: which categories of goods are the most popular?
    """
    return (
        data.groupby(by="category ID")["sales"]
        .sum()
        .sort_values(ascending=False)  # sort_values is ascending by default
        .head(n)
    )


def top_products(data, n=10):
    """Total sales per "product ID", sorted descending, top *n* rows.

    Answers: which individual goods are the most popular?
    """
    return (
        data.groupby(by="product ID")["sales"]
        .sum()
        .sort_values(ascending=False)
        .head(n)
    )


def store_sales_share(data):
    """Each shop's share of total revenue, formatted as 'xx.xx%' strings.

    Adds a per-row revenue column ("sales/single item" =
    unit price * sales) to *data*, sums it per "shop ID", and divides
    by the grand total.
    """
    data.loc[:, "sales/single item"] = (
        data.loc[:, "unit price"] * data.loc[:, "sales"]
    )
    per_shop = data.groupby(by="shop ID")["sales/single item"].sum()
    return (per_shop / per_shop.sum()).apply(lambda x: format(x, ".2%"))


def hourly_order_counts(data):
    """Number of orders placed in each hour of the day, busiest first.

    One order ID may cover several product rows, so rows are first
    de-duplicated on "order ID" before counting.
    """
    orders = data.drop_duplicates(subset="order ID").copy()
    orders.loc[:, "transaction time"] = pd.to_datetime(
        orders.loc[:, "transaction time"]
    )
    orders.loc[:, "hour"] = [t.hour for t in orders.loc[:, "transaction time"]]
    return (
        orders.groupby(by="hour")["order ID"]
        .count()
        .sort_values(ascending=False)
    )


def main():
    """Load ./order.csv and print the four analyses."""
    # NOTE(review): encoding "ansi" only works on Windows; elsewhere use
    # the file's real encoding (often "gbk" for this dataset) — confirm.
    data = pd.read_csv("./order.csv", encoding="ansi")
    print("columns:\n", data.columns)

    data = filter_valid_sales(data)

    # 1. Which categories of goods are the most popular?
    print("top categories:\n", top_categories(data))

    # 2. Which goods are the most popular?
    print("top products:\n", top_products(data))

    # 3. Each store's share of total revenue.
    print("share per store:\n", store_sales_share(data))

    # 4. Peak customer-traffic hours by order count.
    print("orders per hour:\n", hourly_order_counts(data))


if __name__ == "__main__":
    main()
Data Analysis & Data Mining: A Supermarket-Chain Data Analysis Case
You may also like
Origin www.cnblogs.com/Tree0108/p/12116085.html
Recommended
Ranking