机器学习入门------pandas

import pandas

# Read the CSV file into a DataFrame.
food_info = pandas.read_csv("C:/Users/LENOVO/Desktop/food_info.csv")

print(food_info)

结果：无 pandas的read_csv是从文件中把内容读取进来

# head() without an argument returns the first five rows; head(3) the first three.
first_rows = food_info.head()
# print(first_rows)
# print(food_info.head(3))

# Show the column labels of the DataFrame.
print(food_info.columns)

# print(food_info.shape)

结果：

Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
       'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
       'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
       'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
       'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
       'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
       'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
       'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
       'Cholestrl_(mg)'],
      dtype='object')

head 默认返回前 5 行，head(3) 指定返回前 3 行；columns 是列名（列头），shape 是 DataFrame 的形状（行数, 列数）。

# pandas uses zero-indexing.
# Series object representing the row labelled 1 (i.e. the second row).
print (food_info.loc[1])

# Series object representing the seventh row.
#food_info.loc[6]

# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
#food_info.loc[8620]
# The object dtype is equivalent to a string in Python.
# Result:

NDB_No                                 1002
Shrt_Desc          BUTTER WHIPPED WITH SALT
Water_(g)                             15.87
Energ_Kcal                              717
Protein_(g)                            0.85
Lipid_Tot_(g)                         81.11
Ash_(g)                                2.11
Carbohydrt_(g)                         0.06
Fiber_TD_(g)                              0

输出的左边是列头，右边是所取那一行在各列上的值。

# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6
# (a .loc slice is label-based and includes the end label).
food_info.loc[3:6]

# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1: bind the list of labels to a name first.
#two_five_ten = [2,5,10]
#food_info.loc[two_five_ten]

# Method 2: pass the list of labels directly.
#food_info.loc[[2,5,10]]

与上面一样的道理

# Keep only the columns whose name ends with "(g)".
col_names = food_info.columns.tolist()
gram_columns = [c for c in col_names if c.endswith("(g)")]
# Indexing with a list of column names returns a DataFrame with just those columns.
gram_df = food_info[gram_columns]

print(gram_df.head(3))
# Result:

  Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  \(表示分行显示)
0      15.87         0.85          81.11     2.11            0.06   
1      15.87         0.85          81.11     2.11            0.06   
2       0.24         0.28          99.48     0.00            0.00   

   Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  
0             0           0.06      51.368       21.021        3.043  
1             0           0.06      50.489       23.426        3.012  
2             0           0.00      61.924       28.732        3.694

#print(food_info["Iron_(mg)"])
# Divides each value in the column by 1000 and returns a Series object.
div_1000 = food_info["Iron_(mg)"] / 1000
print (div_1000)
# Adds 100 to each value in the column and returns a Series object.
add_100 = food_info["Iron_(mg)"] + 100

# Subtracts 100 from each value in the column and returns a Series object.
#sub_100 = food_info["Iron_(mg)"] - 100

# Multiplies each value in the column by 2 and returns a Series object.
#mult_2 = food_info["Iron_(mg)"]*2
# Result:

0       0.00002
1       0.00016
2       0.00000
3       0.00031
4       0.00043
5       0.00050
6       0.00033

取出文件中指定的列，然后对其中的每一项进行加、减、乘、除运算。

# Arithmetic between two columns is element-wise: the operator is applied to the
# first value in both columns, the second value in both columns, and so on.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Divide the milligram column by 1000 and store the result as a new "Iron_(g)" column.
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams

print(water_energy)
# Result:

0       11378.79
1       11378.79
2         210.24
3       14970.73
4       15251.81
5       16172.28
6       15540.00
7       14769.28
8       15062.60
9       14570.55

同上：两列之间的运算也是逐项进行的。

# By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame.
# Passing inplace=True sorts the DataFrame in-place, rather than returning a new DataFrame.
#print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True)
print (food_info["Sodium_(mg)"])
# Sorts by descending order, rather than ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
#print (food_info["Sodium_(mg)"])
# Result:

扫描二维码关注公众号，回复： 2618101 查看本文章

760      0
8607     0
629      0
630      0
758      0
6470     0
654      0
8599     0
6463     0
633      0
635      0

第一次排序按照默认的升序；第二次把 ascending 设置为 False，则按照降序。inplace 指是否直接在原 DataFrame 上修改，而不是返回一个新的 DataFrame。

结合：泰坦尼克号案例强化pandas

import pandas as pd

import numpy as np

# Load the Titanic training data into a DataFrame.
# NOTE: the variable name is spelled "titanic_survial" here and in the later snippets.
titanic_survial = pd.read_csv("C:/Users/LENOVO/Desktop/titanic_train.csv")
# Bare expression: shows the first rows in an interactive session; prints nothing in a plain script.
titanic_survial.head()

读取部分文件内容展示

# The pandas library uses NaN, which stands for "not a number", to indicate a missing value.
# pd.isnull() takes a Series and returns a Series of True/False values.
age = titanic_survial["Age"]
# print(age.loc[0:10])  # rows labelled 0 through 10 of the Age column (end label included)
age_is_null = pd.isnull(age)
# print(age_is_null)  # True where the value is missing, False where it is present
# Boolean indexing keeps only the entries whose age is missing.
age_null_true = age[age_is_null]
print(age_null_true)
age_null_count = len(age_null_true)

print(age_null_count)  # number of missing ages

# The result of this is that mean_age is nan: any calculation that involves a
# null value also results in a null value.
mean_age = sum(titanic_survial["Age"]) / len(titanic_survial["Age"])

print(mean_age)

结果：

nan

求年龄的平均数时，缺失的部分也参与了计算，所以结果也是缺失值（nan）。

# Keep only the ages that are not missing (~ inverts the boolean mask).
good_ages = titanic_survial["Age"][~age_is_null]
print(good_ages)

# With the missing entries removed, sum / count gives the real mean age.
correct_mean_age = sum(good_ages) / len(good_ages)

print(correct_mean_age)
# Result:

29.6991176471

{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

# index tells the method which column to group by.
# values is the column that we want to apply the calculation to.
# aggfunc specifies the calculation we want to perform; the string alias "mean"
# is used because recent pandas versions deprecate passing np.mean here.
passenger_survival = titanic_survial.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
# Result:

Pclass
1    0.629630
2    0.472826
3    0.242363

pandas 自己把上面的做法封装好了。index 和 values 相互对应，类似键值对（K-V）；aggfunc 指定对每个分组的 values 做何种计算，这里是求均值。

# Group by port of embarkation and total two value columns at once; the string
# alias "sum" is used because recent pandas versions deprecate passing np.sum here.
port_stats = titanic_survial.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
# Result:

                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217

与上面一致，values 也可以同时指定多个列。

# .loc[row_label, column_label] picks out a single cell.
row_index_83_age = titanic_survial.loc[83, "Age"]
row_index_83_pclass = titanic_survial.loc[83, "Pclass"]
print(row_index_83_age)
# Print the value that was actually computed above (the original printed an undefined name).
print(row_index_83_pclass)
# Result:

28.0
1

可以精确地定位到某一行的某个属性（列）。

# sort_values returns a new DataFrame ordered by Age, largest first; the rows keep
# their original index labels.
new_titanic_survival = titanic_survial.sort_values("Age", ascending=False)
# print(new_titanic_survival[0:10])
# reset_index(drop=True) renumbers the rows from 0 and discards the old labels.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)

# loc selects rows by index label, iloc by integer position. After reset_index the
# labels equal the positions, so both address the same rows here (but a loc slice
# includes its end label while an iloc slice excludes its end position).
print(titanic_reindexed.iloc[0:10])
# Result:

  PassengerId  Survived  Pclass                                  Name   Sex  \
0          631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1          852         0       3                   Svensson, Mr. Johan  male   
2          494         0       1               Artagaveytia, Mr. Ramon  male   
3           97         0       1             Goldschmidt, Mr. George B  male   
4          117         0       3                  Connors, Mr. Patrick  male   
5          673         0       2           Mitchell, Mr. Henry Michael  male   
6          746         0       1          Crosby, Capt. Edward Gifford  male   
7           34         0       2                 Wheadon, Mr. Edward H  male   
8           55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9          281         0       3                      Duane, Mr. Frank  male   

    Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0  80.0      0      0       27042  30.0000   A23        S  
1  74.0      0      0      347060   7.7750   NaN        S  
2  71.0      0      0    PC 17609  49.5042   NaN        C  
3  71.0      0      0    PC 17754  34.6542    A5        C  
4  70.5      0      0      370369   7.7500   NaN        Q  
5  70.0      0      0  C.A. 24580  10.5000   NaN        S  
6  70.0      1      1   WE/P 5735  71.0000   B22        S  
7  66.0      0      0  C.A. 24579  10.5000   NaN        S  
8  65.0      0      1      113509  61.9792   B30        C  
9  65.0      0      0      336439   7.7500   NaN        Q

def hundredth_row(column):
    """Return the hundredth entry (position 99) of ``column``.

    ``column`` must support positional ``.iloc`` indexing, e.g. a pandas
    Series. Raises ``IndexError`` when it holds fewer than 100 entries.
    """
    return column.iloc[99]

# Return the hundredth item from each column: DataFrame.apply calls the function
# once per column. The result gets its own name so that the hundredth_row
# function is not overwritten and this snippet can be run again.
hundredth_items = titanic_survial.apply(hundredth_row)
print(hundredth_items)
# Result:

PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S

用自定义函数取出每一列的第 100 项，需要借助 apply 把函数作用到每一列上。

机器学习入门------pandas

猜你喜欢