# Simple pandas operations (part 2)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/8/10 10:09
# @Author  : limingyu
# @Site    : 
# @File    : Test_pandas_titanic_train.py
# @Software: PyCharm
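# Case study: survival of the Titanic passengers.
# Columns: PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
# i.e. passenger id, survival label, ticket class, passenger name, sex, age,
# number of siblings/spouses aboard, number of parents/children aboard,
# ticket number, ticket fare, cabin number, port of embarkation.
# NaN denotes a missing value.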

import pandas as pd
import numpy as np
from pandas import Series
# Load the Titanic training data from CSV into a DataFrame.
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.head())  # first five rows of the CSV
# Sample output (abridged):
# PassengerId Survived Pclass Name     Sex   Age SibSp Parch Ticket     Fare Cabin Embarked
#    1         0        3    Braund    male   22   1     0   A/5 21171  7.25           S
#    2         1        1    Cumings   female 38   1     0   PC 17599  71.2833  C85    C
#    3         1        3    Heikkinen female 26   0     0 STON/O2.3101282 7.925       S
# [5 rows x 12 columns]

# Pull out the "Age" column as a Series.
age = titanic_survival["Age"]
print(age.loc[0:5])  # label-based slice: labels 0 through 5, end included
# Sample output:
# 0     22.0
# 1     38.0
# 2     26.0
# 3     35.0
# 4     35.0
# 5      NaN
# Name: Age, dtype: float64

# Boolean mask marking the missing ages (some passengers have no recorded age).
age_is_null = age.isnull()
print(age_is_null)
# Sample output:
# 0      False
# 1      False
# 2      False
# 3      False
# 4      False
# 5      True
# 6      False
# ...
# Name: Age, Length: 891, dtype: bool

# Use the boolean mask as an indexer: only the entries flagged True are kept.
age_null_true = age.loc[age_is_null]
print(age_null_true)
# Sample output:
# 5     NaN
# 17    NaN
# 19    NaN
# ...
# Name: Age, Length: 177, dtype: float64

# Number of missing ages.
age_null_count = age_null_true.shape[0]
print(age_null_count)  # 177
# Unhandled missing values break naive aggregations: the built-in sum()
# propagates NaN, so this hand-rolled mean comes out as nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)  # nan
# Keep only the ages that are present. The mask is inverted with `~`
# rather than compared against the literal False.
good_ages = titanic_survival["Age"][~age_is_null]
print(good_ages)  # every age that has a value
# Sample output:
# 0      22.0
# 1      38.0
# 2      26.0
# 3      35.0
# 4      35.0
# 6      54.0
# ...
# The hand-rolled mean now works because no NaN is left in the data.
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)  # 29.69911764705882
# Series.mean() skips missing values by default and gives the same result.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)  # 29.69911764705882


# Average ticket fare for each passenger class.
# Approach 1: hand-written logic.
passenger_classes = [1, 2, 3]
fares_by_class = {}  # maps passenger class -> mean fare
for this_class in passenger_classes:
    # Boolean mask selecting the passengers travelling in this class.
    in_this_class = titanic_survival["Pclass"] == this_class
    pclass_rows = titanic_survival[in_this_class]
    print(pclass_rows)
    # Sample output (abridged):
    #    PassengerId Survived Pclass    ...   Fare     Cabin  Embarked
    # 1     2         1       1    ...      71.2833     C85        C
    # 3     4         1       1    ...      53.1000     C123       S
    fares_by_class[this_class] = pclass_rows["Fare"].mean()
print(fares_by_class)  # class -> mean fare
# {1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

# Approach 2: let pandas do the grouping.
# pivot_table groups the rows by `index` and aggregates `values` with `aggfunc`.
# The aggregation is named by string ("mean", "sum"): newer pandas releases
# emit a FutureWarning when NumPy callables such as np.mean are passed instead.
fares_by_class = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(fares_by_class)
# Sample output:
# Pclass
# 1       84.154687
# 2       20.662183
# 3       13.675550

# Mean of the Survived column per class, i.e. the fraction of each class
# that survived (Survived shows up as 0/1 in the sample rows).
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
# Sample output:
# Pclass
# 1       0.629630
# 2       0.472826
# 3       0.242363

# Mean age per class; with no aggfunc given, pivot_table computes the mean.
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)
# Sample output:
# Pclass
#    1   38.233441
#    2   29.877630
#    3   25.140620

# Relate one grouping column to two value columns at once:
# per port of embarkation, the summed fares and the summed Survived flags.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)  # per port: total fare income and number of survivors
# Sample output:
# Embarked
#    C    10072.2962    93
#    Q    1022.2543     30
#    S    17439.3988    217

print("------------------------")
# Handling missing values by discarding them.
# Drop every column that contains at least one missing value.
drop_na_columns = titanic_survival.dropna(axis="columns")
print(drop_na_columns)
# Sample output (abridged):
#   PassengerId Survived Pclass  ...   Parch   Ticket    Fare
# 0    1           0       3    ...      0    A/5 21171  7.2500

# Drop every row that has a missing value in "Fare" or "Cabin".
new_titanic_survival = titanic_survival.dropna(axis="index", subset=["Fare", "Cabin"])
print(new_titanic_survival)
# Sample output (abridged):
#    PassengerId Survived Pclass ...   Fare     Cabin  Embarked
# 1       2         1       1    ...  71.2833    C85      C
# 3       4         1       1    ...  53.1000    C123     S
# 6       7         0       1    ...  51.8625    E46      S


# Look up a single cell by row label and column name.
row_index_83_age = titanic_survival.at[83, "Age"]
# NOTE(review): the variable name says 1000, but the row actually read is label 766.
row_index_1000_pclass = titanic_survival.at[766, "Pclass"]
print(row_index_83_age)  # 28.0
print(row_index_1000_pclass)  # 1


# Sort by age, oldest first, and show the top five rows.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival.head(5))
# Sample output (abridged) - the original row labels travel with the rows:
#      PassengerId  Survived  Pclass  Age    ...       Fare   Cabin  Embarked
# 630     631         1          1    80.0   ...     30.0000   A23      S
# 851     852         0          3    74.0   ...      7.7750   NaN      S
# 493     494         0          1    71.0   ...     49.5042   NaN      C

# Renumber the sorted rows from 0; drop=True discards the old labels
# instead of keeping them as an extra column.
titanic_reindex = new_titanic_survival.reset_index(drop=True)
print(titanic_reindex.head(5))
# Sample output (abridged):
#    PassengerId Survived Pclass  Age   ...    Fare   Cabin  Embarked
# 0    631         1       1      80.0  ...  30.0000   A23       S
# 1    852         0       3      74.0  ...  7.7750    NaN       S
# 2    494         0       1      71.0  ...  49.5042   NaN       C


# Custom function: fetch one entry of a column by its index label.
def hundredth_row(column, row_label=99):
    """Return the entry of *column* stored under index label *row_label*.

    The lookup is label-based (``.loc``), not positional: with the default
    label 99 it yields the 100th row only when the index is the default
    0-based range. Extra keyword arguments given to ``DataFrame.apply`` are
    forwarded, so ``df.apply(hundredth_row, row_label=5)`` also works.

    Args:
        column: A pandas Series (one column when used with ``DataFrame.apply``).
        row_label: Index label to look up; defaults to 99.

    Returns:
        The value stored under *row_label*.

    Raises:
        KeyError: If *row_label* is not present in the index of *column*.
    """
    return column.loc[row_label]
# apply() runs the custom function once per column and collects the results.
# The result gets its own name so that the hundredth_row function is not
# overwritten by its own output and stays callable afterwards.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)
# Sample output:
# PassengerId        100
# Survived            0
# Pclass              2
# Name           Kantor, Mr. Sinai
# Sex                 male
# Age                 34
# SibSp                1
# Parch                0
# Ticket               244367
# Fare                 26
# Cabin                NaN
# Embarked             S
# dtype: object

# Count the missing values in a column.
def not_null_count(column):
    """Return the number of missing entries in *column*.

    Despite its name, this counts the entries that ARE null. As a side
    effect it prints the boolean null-mask and then the missing entries
    themselves (prefixed with "--").

    Args:
        column: A pandas Series (one column when used with ``DataFrame.apply``).

    Returns:
        int: How many entries of *column* are null.
    """
    missing_mask = pd.isna(column)  # True wherever the entry is missing
    print(missing_mask)
    # The mask doubles as an indexer: keep only the missing entries.
    missing_entries = column[missing_mask]
    print("--", missing_entries)
    # Sample output of the line above: an empty Series for a column without
    # gaps, e.g.
    #   Series([], Name: PassengerId, dtype: object)
    # and the NaN entries with their row labels otherwise, e.g.
    #   5      NaN
    #   17     NaN
    #   19     NaN
    return len(missing_entries)
# Run the counter on every column (axis=0 hands the function one column at a time).
column_null_count = titanic_survival.apply(not_null_count, axis=0)
print(column_null_count)  # column name -> number of missing values
# Sample output:
# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age            177
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687
# Embarked         2
# dtype: int64

# Map the passenger classes 1/2/3 to First Class / Second Class / Third Class.
def which_class(row):
    """Return a readable label for the ``Pclass`` entry of *row*.

    Args:
        row: A mapping-like object (e.g. one DataFrame row) with a
            ``'Pclass'`` entry.

    Returns:
        "Unknow" when the class is missing, "First Class" / "Second Class" /
        "Third Class" for 1 / 2 / 3, and None for any other value.
    """
    level = row['Pclass']
    if pd.isnull(level):
        return "Unknow"
    labels = ((1, "First Class"), (2, "Second Class"), (3, "Third Class"))
    for number, label in labels:
        if level == number:
            return label
    # Any other value has no label.
    return None
# Label every row: axis="columns" makes apply() pass one row at a time.
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
# Sample output:
# 0       Third Class
# 1       First Class
# 2       Third Class

# Turn a continuous value into a discrete one: age -> minor / adult / missing.
def generate_age_label(row, adult_age=18):
    """Return an age-group label for the ``Age`` entry of *row*.

    A passenger whose age equals *adult_age* counts as an adult; the
    previous ``age > 18`` test labelled 18-year-olds as minors.

    Args:
        row: A mapping-like object (e.g. one DataFrame row) with an
            ``"Age"`` entry.
        adult_age: Smallest age that is labelled "adult"; defaults to 18.

    Returns:
        "Unknow" when the age is missing (spelling kept as is, since the
        label is part of the program's output), "adult" when
        ``age >= adult_age``, otherwise "minor".
    """
    age = row["Age"]
    if pd.isnull(age):
        return "Unknow"
    return "adult" if age >= adult_age else "minor"
# Label every row with its age group (one row per call).
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
# Sample output:
# 0       adult
# 1       adult

# Relate survival to the age group: store the labels as a new column, then
# take the mean of Survived per label (the mean is pivot_table's default).
titanic_survival["age_labels"] = age_labels
age_grouo_survival = titanic_survival.pivot_table(values="Survived", index="age_labels")
print(age_grouo_survival)
# Sample output from the original author's run:
# age_labels
# Unknow      0.293785
# adult       0.382609
# minor       0.503597



# A DataFrame is made up of Series, and a Series wraps an ndarray:
# DataFrame ~ the whole table that was read, Series ~ one row or column of
# it, ndarray ~ the values held by a Series.
fandango = pd.read_csv('fandango_score_comparison.csv')
print(type(fandango))  # <class 'pandas.core.frame.DataFrame'>

# One column of the table is a Series.
series_film = fandango["FILM"]
print(type(series_film))  # <class 'pandas.core.series.Series'>
print(series_film.head(5))  # first five rows of the FILM column
# Sample output:
# 0    Avengers: Age of Ultron (2015)
# 1                 Cinderella (2015)
# 2                    Ant-Man (2015)
# 3            Do You Believe? (2015)
# 4     Hot Tub Time Machine 2 (2015)
# Name: FILM, dtype: object

# Scores from the review site Rotten Tomatoes.
series_rt = fandango['RottenTomatoes']
print(series_rt.head(5))  # first five Rotten Tomatoes scores
# Sample output:
# 0    74
# 1    85
# 2    80
# 3    18
# 4    14
# Name: RottenTomatoes, dtype: int64


# Extract the raw values behind the two Series.
film_names = series_film.values  # film titles only
rt_scores = series_rt.values  # review scores only
print(series_film)  # index labels together with the film titles
# 0           Avengers: Age of Ultron (2015)
# 1           Cinderella (2015)...
print(film_names)  # just the film titles
# ['Avengers: Age of Ultron (2015)' 'Cinderella  (2015)' 'Ant-Man (2015)'...
print(type(film_names))  # <class 'numpy.ndarray'>
print(rt_scores)  # [ 74  85  80  18  14  63  ...]

# Build a custom Series: one score per film, indexed by the film title
# (i.e. a string index instead of the default integer one).
series_custom = pd.Series(data=rt_scores, index=film_names)
print("--", series_custom)
# Avengers: Age of Ultron (2015)     74
# Cinderella (2015)                 85
# Label-based lookup works as well, e.g.:
# series_custom[['Minions (2015)','Leviathan (2014)']]

# Positional slice: entries at positions 5 to 9.
fiveten = series_custom.iloc[5:10]
print(fiveten)
# The Water Diviner (2015)        63
# Irrational Man (2015)           42
# Top Five (2014)                 86
# Shaun the Sheep Movie (2015)    99
# Love & Mercy (2015)             89
# dtype:


# Sorting a Series by hand.
original_index = list(series_custom.index)  # film titles as a plain list
print(original_index)  # the film titles
sorted_index = sorted(original_index)  # titles in lexicographic order
# reindex() rearranges the Series to follow the given label order.
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)

# Built-in sorting: by key (index) or by value.
sc2 = series_custom.sort_index()
print(sc2.iloc[:5])
# Sample output:
# '71 (2015)                 97
# 5 Flights Up (2015)        52
# A Little Chaos (2015)      40
# A Most Violent Year (2014) 90
# About Elly (2015)          97...
sc3 = series_custom.sort_values()
print(sc3.iloc[:5])
# Sample output:
# Paul Blart: Mall Cop 2 (2015) 5
# Hitman: Agent 47 (2015)       7
# Hot Pursuit (2015)            8
# Fantastic Four (2015)         9
# Taken 3 (2015)                9...

# NumPy functions accept a Series directly.
# Element-wise addition of the Series to itself:
print(np.add(series_custom, series_custom))
# Avengers: Age of Ultron (2015)     148
# Cinderella (2015)                  170
# Element-wise sine, then the largest score:
print(np.sin(series_custom))
print(np.max(series_custom))
# Simple pandas operations (part 2)

# (web-page residue: "You may also like")