数据分析处理库-pandas

pandas基本操作

python-3.7　　pycharm　　pandas-0.23.4

  1 """
  2     数据分析处理库-pandas
  3     时间：2018\9\12 0012
  4     pandas是基于numpy的封装库
  5 """
  6 import pandas as pd
  7 import numpy as np
  8 from pandas import Series
  9 
 10 print("""\n---------------------------------读取文件---------------------------------------------
 11 -----------------------读取文件函数read_csv("文件路径")------------------------------------\n""")
 12 food_info = pd.read_csv("food_info.csv")
 13 print(type(food_info))  # DataFrame是pandas的核心结构
 14 print(food_info.dtypes)  # 数据类型结构 字符叫做object，python中叫做str
 15 # print(help(pd.read_csv))
 16 
 17 
 18 print("""\n---------------------------------数据操作---------------------------------------------
 19 -----------------------显示前N条数据(行)food_info.head(条数[默认5行])------------------------------------\n""")
 20 print(food_info.head(3))  # 默认5条，参数是条数
 21 
 22 print("""\n---------------------------------数据操作---------------------------------------------
 23 -----------------------显示尾N条数据(行)food_info.tail(条数[默认5行])------------------------------------\n""")
 24 print(food_info.tail(4))  # 默认5条，参数是条数
 25 
 26 print("""\n---------------------------------数据操作---------------------------------------------
 27 ---------------------------显示列名food_info.columns------------------------------------\n""")
 28 print(food_info.columns)
 29 
 30 print("""\n---------------------------------数据操作---------------------------------------------
 31 ---------------------------查看矩阵的行列数food_info.shape------------------------------------\n""")
 32 print(food_info.shape)
 33 
 34 print("""\n---------------------------------数据操作---------------------------------------------
 35 ---------------------------根据索引取出数据food_info.loc[索引]------------------------------------\n""")
 36 print(food_info.loc[0])
 37 print("""\n---------------------------------数据操作---------------------------------------------
 38 -----------------根据列名取出数据food_info["列名","列名"]或者food_info["列名"]--------------------------\n""")
 39 ndb_col = food_info['NDB_No']
 40 print(ndb_col)
 41 
 42 ndb_col2 = food_info[["NDB_No", "Energ_Kcal"]]  # 传入list结构
 43 print(ndb_col2)
 44 
 45 print("""\n---------------------------------数据操作---------------------------------------------
 46 ------------------------------------寻找以g为结尾的列名--------------------------\n""")
 47 col_names = food_info.columns.tolist()
 48 print(col_names)
 49 gram_columns = []
 50 
 51 for c in col_names:
 52     if c.endswith(('(g)')):
 53         gram_columns.append(c)
 54 gram_df = food_info[gram_columns]
 55 print(gram_df.head(3))
 56 
 57 print("""\n---------------------------------数据操作---------------------------------------------
 58 ------------------------------------数据除以常数--------------------------\n""")
 59 print(food_info['Iron_(mg)'])  # 根据列名打印
 60 div_1000 = food_info['Iron_(mg)'] / 1000  # 每个元素除以1000
 61 print(div_1000)
 62 
 63 print("""\n---------------------------------数据操作---------------------------------------------
 64 ------------------------------------元素对应位置相乘，表格增加列(维度相同)--------------------------\n""")
 65 water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
 66 water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
 67 iron_grams = food_info["Iron_(mg)"] / 1000
 68 print(food_info.shape)
 69 food_info["Iron_(g)"] = iron_grams
 70 print(food_info.shape)
 71 
 72 print("""\n---------------------------------数据操作---------------------------------------------
 73 ------------------------------------元素对应位置加减乘除--------------------------\n""")
 74 weighted_protein = food_info["Protein_(g)"] * 2
 75 weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
 76 initial_rating = weighted_protein + weighted_fat
 77 print(initial_rating.head())
 78 
 79 print("""\n---------------------------------数据操作---------------------------------------------
 80 ------------------------------------最大值，最小值，均值-----------------------------------------------\n""")
 81 max_calories = food_info["Energ_Kcal"].max()
 82 normalized_calories = food_info["Energ_Kcal"] / max_calories
 83 normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
 84 normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
 85 food_info["Normalized_Protein"] = normalized_protein
 86 food_info["Normalized_Fat"] = normalized_fat
 87 print(food_info["Normalized_Protein"].head())
 88 print(food_info["Normalized_Fat"].head())
 89 
 90 print("""\n---------------------------------数据操作---------------------------------------------
 91 ------------------------------------排序----------------------------------------------------------\n""")
 92 food_info.sort_values("Sodium_(mg)", inplace = True)  # inplace = True在原处排序，False生成新的DataFrame
 93 print(food_info["Sodium_(mg)"])
 94 food_info.sort_values("Sodium_(mg)", inplace = True, ascending = False)  # ascending升序排列
 95 print(food_info["Sodium_(mg)"])
 96 
 97 print("""\n---------------------------------数据操作---------------------------------------------
 98 ------------------------------------泰坦尼克号获救------------------------------------------------\n""")
 99 
100 titanic_survival = pd.read_csv('titanic_train.csv')
101 print(titanic_survival.head())
102 age = titanic_survival['Age']
103 print(age.loc[0:10])  # 查看年龄列前10条
104 age_is_null = pd.isnull(age)  # 判断年龄列为空值
105 print(age_is_null)
106 age_null_true = age[age_is_null]  # 将空年龄的索引值挑出来
107 print(age_null_true)
108 age_null_count = len(age_null_true)  # 看空年龄的总数
109 print(age_null_count)
110 
111 good_ages = titanic_survival["Age"][age_is_null == False]  # 挑出年龄列中年龄不为空值的部分
112 print(good_ages)
113 correct_mean_age = sum(good_ages) / len(good_ages)  # 计算平均年龄
114 print(correct_mean_age)
115 print("""\n---------------------------------数据操作---------------------------------------------
116 ------------------------------------泰坦尼克号获救----现成的均值函数mean()---------------------------------\n""")
117 correct_mean_age = titanic_survival['Age'].mean()
118 print(correct_mean_age)
119 
# Section: Titanic data set - average fare for each passenger class,
# computed by filtering the table once per class.
print("""\n---------------------------------数据操作---------------------------------------------
------------------------------------泰坦尼克号获救----不同等级仓位的平均价格---------------------------------\n""")
passenger_class = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_class:
    # Boolean mask selecting every passenger travelling in this class.
    in_this_class = titanic_survival["Pclass"] == this_class
    # Mean of the Fare column over those rows, stored under the class number.
    fares_by_class[this_class] = titanic_survival[in_this_class]["Fare"].mean()
print(fares_by_class)
# Section: Titanic data set - aggregate one column grouped by another with pivot_table.
print("""\n---------------------------------数据操作---------------------------------------------
-----------泰坦尼克号获救----不同数据关联求值titanic_survival.pivot_table()---------------------\n""")
# index: the column to group by; values: the column(s) to aggregate;
# aggfunc: how to aggregate. Aggregations are named by string so pandas uses
# its own implementation (newer pandas warns when given np.mean / np.sum).
print("""---------index:以谁为基准--values：统计index跟什么之间的关系-----aggfunc：什么关系-------------""")
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
# Average age per passenger class; with no aggfunc given, the mean is used.
print("""---------看仓位与年龄的关系-------------""")
passenger_age = titanic_survival.pivot_table(index='Pclass', values='Age')
print(passenger_age)
# Total fare and number of survivors per port of embarkation.
print("""---------看登船地点与（船票价格和获救）的关系-------------""")
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
# Section: dropping missing values with dropna.
# NOTE: the banner text below has the axes reversed. axis=0 ("index") drops
# rows that contain missing values; axis=1 ("columns") drops columns.
print("""---------丢弃缺失值--dropna(axis = 1行0列)-----------""")
# Drop every column that contains at least one missing value.
drop_na_columns = titanic_survival.dropna(axis="columns")
# Drop rows, but only those with a missing value in the Age or Sex column.
new_titanic_survival = titanic_survival.dropna(axis="index", subset=['Age', 'Sex'])
print(new_titanic_survival)
# Section: read a single cell with .loc[row label, column name].
print("""---------定位到具体值-----------""")
row_index_83_age = titanic_survival.loc[83, 'Age']
# Named after the row it actually reads (label 766).
row_index_766_pclass = titanic_survival.loc[766, 'Pclass']
print(row_index_83_age)
print(row_index_766_pclass)
# Section: after sorting, rows keep their old index labels; reset_index renumbers them.
print("""---------按照某数据排序后，重新排序index-----------""")
new_titanic_survival = titanic_survival.sort_values('Age', ascending=False)
# First ten rows of the sorted table, still carrying their original labels.
print(new_titanic_survival.head(10))
print('-----------')
# drop=True discards the old labels instead of keeping them as a new column.
titanic_reindex = new_titanic_survival.reset_index(drop=True)
print(titanic_reindex.head(10))
156 print("""---------apply函数-----------""")
157 
158 
def hundredth_row(column, row_label=99):
    """Return the entry of ``column`` stored under index label ``row_label``.

    Intended for ``DataFrame.apply``, which calls it once per column and
    collects the results into a Series keyed by column name.

    The lookup uses ``.loc`` and is therefore label-based, not positional:
    with the default 0..n-1 index, label 99 is the hundredth row, but after
    sorting or filtering it is whichever row still carries that label.

    Args:
        column: A pandas Series (one column of a DataFrame).
        row_label: Index label to look up. Defaults to 99.

    Returns:
        The value stored at ``row_label``.

    Raises:
        KeyError: If ``row_label`` is not present in the index.
    """
    return column.loc[row_label]
162 
163 
164 hundredth_row_is = titanic_survival.apply(hundredth_row)
165 print(hundredth_row_is)
166 
167 
def not_null_count(column):
    """Return how many entries of ``column`` are missing (NaN / None).

    NOTE: despite its name, this counts the NULL entries, not the non-null
    ones. The name is kept so existing callers keep working.

    Intended for ``DataFrame.apply``, which calls it once per column.

    Args:
        column: A pandas Series (one column of a DataFrame).

    Returns:
        The number of missing entries, as a plain ``int``.
    """
    # Summing the boolean mask counts the True (missing) entries directly,
    # without building a filtered copy of the column first.
    return int(pd.isnull(column).sum())
172 
173 
174 column_null_count = titanic_survival.apply(not_null_count)
175 print(column_null_count)
176 
# Section: a DataFrame is made up of Series objects; selecting one column yields a Series.
print("""\n---------------------------------数据结构---------------------------------------------
-----------pandas内部dataframe是由一系列的series结构组成的(行或列)---------------------\n""")
fandango = pd.read_csv("fandango_score_comparison.csv")
series_film = fandango['FILM']
film_type = type(series_film)
print(film_type)
print(series_film[0:5])  # first five film titles
series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])  # first five Rotten Tomatoes scores
185 
# Section: the values of a Series are held in a numpy ndarray.
print("""\n---------------------------------数据结构---------------------------------------------
-----------------------series结构是由Ndarray组成的-----------------------------------------------\n""")
film_names = series_film.values
print(type(film_names))
# print(film_names)
rt_scores = series_rt.values
# print(rt_scores)
# Build a Series from the scores, using the film titles as the index;
# strings work as index labels.
series_custom = Series(rt_scores, index=film_names)
selected_films = ["Minions (2015)", "Leviathan (2014)"]
print(series_custom[selected_films])
# Positional slice: the entries at positions 5 through 9.
fiveten = series_custom.iloc[5:10]
print(fiveten)

运行结果：

D:\Python\python.exe G:/编程/python/project/TYD/01/01/03/pandas_test.py

---------------------------------读取文件---------------------------------------------
-----------------------读取文件函数read_csv("文件路径")------------------------------------

<class 'pandas.core.frame.DataFrame'>
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object

---------------------------------数据操作---------------------------------------------
-----------------------显示前N条数据(行)food_info.head(条数[默认5行])------------------------------------

NDB_No Shrt_Desc ... FA_Poly_(g) Cholestrl_(mg)
0 1001 BUTTER WITH SALT ... 3.043 215.0
1 1002 BUTTER WHIPPED WITH SALT ... 3.012 219.0
2 1003 BUTTER OIL ANHYDROUS ... 3.694 256.0

[3 rows x 36 columns]

---------------------------------数据操作---------------------------------------------
-----------------------显示尾N条数据(行)food_info.tail(条数[默认5行])------------------------------------

NDB_No ... Cholestrl_(mg)
8614 90240 ... 41.0
8615 90480 ... 0.0
8616 90560 ... 50.0
8617 93600 ... 50.0

[4 rows x 36 columns]

---------------------------------数据操作---------------------------------------------
---------------------------显示列名food_info.columns------------------------------------

Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
'Cholestrl_(mg)'],
dtype='object')

---------------------------------数据操作---------------------------------------------
---------------------------查看矩阵的行列数food_info.shape------------------------------------

(8618, 36)

---------------------------------数据操作---------------------------------------------
---------------------------根据索引取出数据food_info.loc[索引]------------------------------------

NDB_No 1001
Shrt_Desc BUTTER WITH SALT
Water_(g) 15.87
Energ_Kcal 717
Protein_(g) 0.85
Lipid_Tot_(g) 81.11
Ash_(g) 2.11
Carbohydrt_(g) 0.06
Fiber_TD_(g) 0
Sugar_Tot_(g) 0.06
Calcium_(mg) 24
Iron_(mg) 0.02
Magnesium_(mg) 2
Phosphorus_(mg) 24
Potassium_(mg) 24
Sodium_(mg) 643
Zinc_(mg) 0.09
Copper_(mg) 0
Manganese_(mg) 0
Selenium_(mcg) 1
Vit_C_(mg) 0
Thiamin_(mg) 0.005
Riboflavin_(mg) 0.034
Niacin_(mg) 0.042
Vit_B6_(mg) 0.003
Vit_B12_(mcg) 0.17
Vit_A_IU 2499
Vit_A_RAE 684
Vit_E_(mg) 2.32
Vit_D_mcg 1.5
Vit_D_IU 60
Vit_K_(mcg) 7
FA_Sat_(g) 51.368
FA_Mono_(g) 21.021
FA_Poly_(g) 3.043
Cholestrl_(mg) 215
Name: 0, dtype: object

---------------------------------数据操作---------------------------------------------
-----------------根据列名取出数据food_info["列名","列名"]或者food_info["列名"]--------------------------

0 1001
1 1002
2 1003
3 1004
4 1005
5 1006
6 1007
7 1008
8 1009
9 1010
10 1011
11 1012
12 1013
13 1014
14 1015
15 1016
16 1017
17 1018
18 1019
19 1020
20 1021
21 1022
22 1023
23 1024
24 1025
25 1026
26 1027
27 1028
28 1029
29 1030
...
8588 43544
8589 43546
8590 43550
8591 43566
8592 43570
8593 43572
8594 43585
8595 43589
8596 43595
8597 43597
8598 43598
8599 44005
8600 44018
8601 44048
8602 44055
8603 44061
8604 44074
8605 44110
8606 44158
8607 44203
8608 44258
8609 44259
8610 44260
8611 48052
8612 80200
8613 83110
8614 90240
8615 90480
8616 90560
8617 93600
Name: NDB_No, Length: 8618, dtype: int64
NDB_No Energ_Kcal
0 1001 717
1 1002 717
2 1003 876
3 1004 353
4 1005 371
5 1006 334
6 1007 300
7 1008 376
8 1009 406
9 1010 387
10 1011 394
11 1012 98
12 1013 97
13 1014 72
14 1015 81
15 1016 72
16 1017 342
17 1018 357
18 1019 264
19 1020 389
20 1021 466
21 1022 356
22 1023 413
23 1024 327
24 1025 373
25 1026 300
26 1027 318
27 1028 254
28 1029 301
29 1030 368
... ... ...
8588 43544 389
8589 43546 91
8590 43550 68
8591 43566 465
8592 43570 401
8593 43572 429
8594 43585 73
8595 43589 179
8596 43595 377
8597 43597 280
8598 43598 688
8599 44005 884
8600 44018 279
8601 44048 257
8602 44055 319
8603 44061 356
8604 44074 62
8605 44110 179
8606 44158 181
8607 44203 287
8608 44258 365
8609 44259 351
8610 44260 350
8611 48052 370
8612 80200 73
8613 83110 305
8614 90240 111
8615 90480 269
8616 90560 90
8617 93600 89

[8618 rows x 2 columns]

---------------------------------数据操作---------------------------------------------
------------------------------------寻找以g为结尾的列名--------------------------

['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
Water_(g) Protein_(g) ... FA_Mono_(g) FA_Poly_(g)
0 15.87 0.85 ... 21.021 3.043
1 15.87 0.85 ... 23.426 3.012
2 0.24 0.28 ... 28.732 3.694

[3 rows x 10 columns]

---------------------------------数据操作---------------------------------------------
------------------------------------数据除以常数--------------------------

0 0.02
1 0.16
2 0.00
3 0.31
4 0.43
5 0.50
6 0.33
7 0.64
8 0.16
9 0.21
10 0.76
11 0.07
12 0.16
13 0.15
14 0.13
15 0.14
16 0.38
17 0.44
18 0.65
19 0.23
20 0.52
21 0.24
22 0.17
23 0.13
24 0.72
25 0.44
26 0.20
27 0.22
28 0.23
29 0.41
...
8588 9.00
8589 0.30
8590 0.10
8591 1.63
8592 34.82
8593 2.28
8594 0.17
8595 0.17
8596 4.86
8597 0.25
8598 0.23
8599 0.13
8600 0.11
8601 0.68
8602 7.83
8603 3.11
8604 0.30
8605 0.18
8606 0.80
8607 0.04
8608 3.87
8609 0.05
8610 0.38
8611 5.20
8612 1.50
8613 1.40
8614 0.58
8615 3.60
8616 3.50
8617 1.40
Name: Iron_(mg), Length: 8618, dtype: float64
0 0.00002
1 0.00016
2 0.00000
3 0.00031
4 0.00043
5 0.00050
6 0.00033
7 0.00064
8 0.00016
9 0.00021
10 0.00076
11 0.00007
12 0.00016
13 0.00015
14 0.00013
15 0.00014
16 0.00038
17 0.00044
18 0.00065
19 0.00023
20 0.00052
21 0.00024
22 0.00017
23 0.00013
24 0.00072
25 0.00044
26 0.00020
27 0.00022
28 0.00023
29 0.00041
...
8588 0.00900
8589 0.00030
8590 0.00010
8591 0.00163
8592 0.03482
8593 0.00228
8594 0.00017
8595 0.00017
8596 0.00486
8597 0.00025
8598 0.00023
8599 0.00013
8600 0.00011
8601 0.00068
8602 0.00783
8603 0.00311
8604 0.00030
8605 0.00018
8606 0.00080
8607 0.00004
8608 0.00387
8609 0.00005
8610 0.00038
8611 0.00520
8612 0.00150
8613 0.00140
8614 0.00058
8615 0.00360
8616 0.00350
8617 0.00140
Name: Iron_(mg), Length: 8618, dtype: float64

---------------------------------数据操作---------------------------------------------
------------------------------------元素对应位置相乘，表格增加列(维度相同)--------------------------

(8618, 36)
(8618, 37)

---------------------------------数据操作---------------------------------------------
------------------------------------元素对应位置加减乘除--------------------------

0 -59.1325
1 -59.1325
2 -74.0500
3 21.2450
4 24.2200
dtype: float64

---------------------------------数据操作---------------------------------------------
------------------------------------最大值，最小值，均值-----------------------------------------------

0 0.009624
1 0.009624
2 0.003170
3 0.242301
4 0.263134
Name: Normalized_Protein, dtype: float64
0 0.8111
1 0.8111
2 0.9948
3 0.2874
4 0.2968
Name: Normalized_Fat, dtype: float64

---------------------------------数据操作---------------------------------------------
------------------------------------排序----------------------------------------------------------

760 0.0
758 0.0
405 0.0
761 0.0
2269 0.0
763 0.0
764 0.0
770 0.0
774 0.0
396 0.0
395 0.0
6827 0.0
394 0.0
393 0.0
391 0.0
390 0.0
787 0.0
788 0.0
2270 0.0
2231 0.0
407 0.0
748 0.0
409 0.0
747 0.0
702 0.0
703 0.0
704 0.0
705 0.0
706 0.0
707 0.0
...
8153 NaN
8155 NaN
8156 NaN
8157 NaN
8158 NaN
8159 NaN
8160 NaN
8161 NaN
8163 NaN
8164 NaN
8165 NaN
8167 NaN
8169 NaN
8170 NaN
8172 NaN
8173 NaN
8174 NaN
8175 NaN
8176 NaN
8177 NaN
8178 NaN
8179 NaN
8180 NaN
8181 NaN
8183 NaN
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), Length: 8618, dtype: float64
276 38758.0
5814 27360.0
6192 26050.0
1242 26000.0
1245 24000.0
1243 24000.0
1244 23875.0
292 17000.0
1254 11588.0
5811 10600.0
8575 9690.0
291 8068.0
1249 8031.0
5812 7893.0
1292 7851.0
293 7203.0
4472 7027.0
4836 6820.0
1261 6580.0
3747 6008.0
1266 5730.0
4835 5586.0
4834 5493.0
1263 5356.0
1553 5203.0
1552 5053.0
1251 4957.0
1257 4843.0
294 4616.0
8613 4450.0
...
8153 NaN
8155 NaN
8156 NaN
8157 NaN
8158 NaN
8159 NaN
8160 NaN
8161 NaN
8163 NaN
8164 NaN
8165 NaN
8167 NaN
8169 NaN
8170 NaN
8172 NaN
8173 NaN
8174 NaN
8175 NaN
8176 NaN
8177 NaN
8178 NaN
8179 NaN
8180 NaN
8181 NaN
8183 NaN
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), Length: 8618, dtype: float64

---------------------------------数据操作---------------------------------------------
------------------------------------泰坦尼克号获救------------------------------------------------

PassengerId Survived Pclass ... Fare Cabin Embarked
0 1 0 3 ... 7.2500 NaN S
1 2 1 1 ... 71.2833 C85 C
2 3 1 3 ... 7.9250 NaN S
3 4 1 1 ... 53.1000 C123 S
4 5 0 3 ... 8.0500 NaN S

[5 rows x 12 columns]
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
5 NaN
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
Name: Age, dtype: float64
0 False
1 False
2 False
3 False
4 False
5 True
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 True
18 False
19 True
20 False
21 False
22 False
23 False
24 False
25 False
26 True
27 False
28 True
29 True
...
861 False
862 False
863 True
864 False
865 False
866 False
867 False
868 True
869 False
870 False
871 False
872 False
873 False
874 False
875 False
876 False
877 False
878 True
879 False
880 False
881 False
882 False
883 False
884 False
885 False
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN
29 NaN
31 NaN
32 NaN
36 NaN
42 NaN
45 NaN
46 NaN
47 NaN
48 NaN
55 NaN
64 NaN
65 NaN
76 NaN
77 NaN
82 NaN
87 NaN
95 NaN
101 NaN
107 NaN
109 NaN
121 NaN
126 NaN
128 NaN
140 NaN
154 NaN
..
718 NaN
727 NaN
732 NaN
738 NaN
739 NaN
740 NaN
760 NaN
766 NaN
768 NaN
773 NaN
776 NaN
778 NaN
783 NaN
790 NaN
792 NaN
793 NaN
815 NaN
825 NaN
826 NaN
828 NaN
832 NaN
837 NaN
839 NaN
846 NaN
849 NaN
859 NaN
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
11 58.0
12 20.0
13 39.0
14 14.0
15 55.0
16 2.0
18 31.0
20 35.0
21 34.0
22 15.0
23 28.0
24 8.0
25 38.0
27 19.0
30 40.0
33 66.0
34 28.0
35 42.0
37 21.0
38 18.0
...
856 45.0
857 51.0
858 24.0
860 41.0
861 21.0
862 48.0
864 24.0
865 42.0
866 27.0
867 31.0
869 4.0
870 26.0
871 47.0
872 33.0
873 47.0
874 28.0
875 15.0
876 20.0
877 19.0
879 56.0
880 25.0
881 33.0
882 22.0
883 28.0
884 25.0
885 39.0
886 27.0
887 19.0
889 26.0
890 32.0
Name: Age, Length: 714, dtype: float64
29.69911764705882

---------------------------------数据操作---------------------------------------------
------------------------------------泰坦尼克号获救----现成的均值函数mean()---------------------------------

29.69911764705882

---------------------------------数据操作---------------------------------------------
------------------------------------泰坦尼克号获救----不同等级仓位的平均价格---------------------------------

{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}

---------------------------------数据操作---------------------------------------------
-----------泰坦尼克号获救----不同数据关联求值titanic_survival.pivot_table()---------------------

---------index:以谁为基准--values：统计index跟什么之间的关系-----aggfunc：什么关系-------------
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
---------看仓位与年龄的关系-------------
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
---------看登船地点与（船票价格和获救）的关系-------------
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
---------丢弃缺失值--dropna(axis = 1行0列)-----------
PassengerId Survived Pclass ... Fare Cabin Embarked
0 1 0 3 ... 7.2500 NaN S
1 2 1 1 ... 71.2833 C85 C
2 3 1 3 ... 7.9250 NaN S
3 4 1 1 ... 53.1000 C123 S
4 5 0 3 ... 8.0500 NaN S
6 7 0 1 ... 51.8625 E46 S
7 8 0 3 ... 21.0750 NaN S
8 9 1 3 ... 11.1333 NaN S
9 10 1 2 ... 30.0708 NaN C
10 11 1 3 ... 16.7000 G6 S
11 12 1 1 ... 26.5500 C103 S
12 13 0 3 ... 8.0500 NaN S
13 14 0 3 ... 31.2750 NaN S
14 15 0 3 ... 7.8542 NaN S
15 16 1 2 ... 16.0000 NaN S
16 17 0 3 ... 29.1250 NaN Q
18 19 0 3 ... 18.0000 NaN S
20 21 0 2 ... 26.0000 NaN S
21 22 1 2 ... 13.0000 D56 S
22 23 1 3 ... 8.0292 NaN Q
23 24 1 1 ... 35.5000 A6 S
24 25 0 3 ... 21.0750 NaN S
25 26 1 3 ... 31.3875 NaN S
27 28 0 1 ... 263.0000 C23 C25 C27 S
30 31 0 1 ... 27.7208 NaN C
33 34 0 2 ... 10.5000 NaN S
34 35 0 1 ... 82.1708 NaN C
35 36 0 1 ... 52.0000 NaN S
37 38 0 3 ... 8.0500 NaN S
38 39 0 3 ... 18.0000 NaN S
.. ... ... ... ... ... ... ...
856 857 1 1 ... 164.8667 NaN S
857 858 1 1 ... 26.5500 E17 S
858 859 1 3 ... 19.2583 NaN C
860 861 0 3 ... 14.1083 NaN S
861 862 0 2 ... 11.5000 NaN S
862 863 1 1 ... 25.9292 D17 S
864 865 0 2 ... 13.0000 NaN S
865 866 1 2 ... 13.0000 NaN S
866 867 1 2 ... 13.8583 NaN C
867 868 0 1 ... 50.4958 A24 S
869 870 1 3 ... 11.1333 NaN S
870 871 0 3 ... 7.8958 NaN S
871 872 1 1 ... 52.5542 D35 S
872 873 0 1 ... 5.0000 B51 B53 B55 S
873 874 0 3 ... 9.0000 NaN S
874 875 1 2 ... 24.0000 NaN C
875 876 1 3 ... 7.2250 NaN C
876 877 0 3 ... 9.8458 NaN S
877 878 0 3 ... 7.8958 NaN S
879 880 1 1 ... 83.1583 C50 C
880 881 1 2 ... 26.0000 NaN S
881 882 0 3 ... 7.8958 NaN S
882 883 0 3 ... 10.5167 NaN S
883 884 0 2 ... 10.5000 NaN S
884 885 0 3 ... 7.0500 NaN S
885 886 0 3 ... 29.1250 NaN Q
886 887 0 2 ... 13.0000 NaN S
887 888 1 1 ... 30.0000 B42 S
889 890 1 1 ... 30.0000 C148 C
890 891 0 3 ... 7.7500 NaN Q

[714 rows x 12 columns]
---------定位到具体值-----------
28.0
1
---------按照某数据排序后，重新排序index-----------
PassengerId Survived Pclass ... Fare Cabin Embarked
630 631 1 1 ... 30.0000 A23 S
851 852 0 3 ... 7.7750 NaN S
493 494 0 1 ... 49.5042 NaN C
96 97 0 1 ... 34.6542 A5 C
116 117 0 3 ... 7.7500 NaN Q
672 673 0 2 ... 10.5000 NaN S
745 746 0 1 ... 71.0000 B22 S
33 34 0 2 ... 10.5000 NaN S
54 55 0 1 ... 61.9792 B30 C
280 281 0 3 ... 7.7500 NaN Q

[10 rows x 12 columns]
-----------
PassengerId Survived Pclass ... Fare Cabin Embarked
0 631 1 1 ... 30.0000 A23 S
1 852 0 3 ... 7.7750 NaN S
2 494 0 1 ... 49.5042 NaN C
3 97 0 1 ... 34.6542 A5 C
4 117 0 3 ... 7.7500 NaN Q
5 673 0 2 ... 10.5000 NaN S
6 746 0 1 ... 71.0000 B22 S
7 34 0 2 ... 10.5000 NaN S
8 55 0 1 ... 61.9792 B30 C
9 281 0 3 ... 7.7500 NaN Q

[10 rows x 12 columns]
---------apply函数-----------
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64

---------------------------------数据结构---------------------------------------------
-----------pandas内部dataframe是由一系列的series结构组成的(行或列)---------------------

<class 'pandas.core.series.Series'>
0 Avengers: Age of Ultron (2015)
1 Cinderella (2015)
2 Ant-Man (2015)
3 Do You Believe? (2015)
4 Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
0 74
1 85
2 80
3 18
4 14
Name: RottenTomatoes, dtype: int64

---------------------------------数据结构---------------------------------------------
-----------------------series结构是由Ndarray组成的-----------------------------------------------

<class 'numpy.ndarray'>
Minions (2015) 54
Leviathan (2014) 99
dtype: int64
The Water Diviner (2015) 63
Irrational Man (2015) 42
Top Five (2014) 86
Shaun the Sheep Movie (2015) 99
Love & Mercy (2015) 89
dtype: int64

Process finished with exit code 0

所需附件：

food_info.csv

titanic_train.csv

fandango_score_comparison.csv

数据分析处理库-pandas

猜你喜欢