Python pandas 的用法

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/youshijian99/article/details/80607733

数据类型
object          -     字符串
int                -     整型
float             -     浮点型
datetime      -     时间类型
bool             -     布尔型

name,age,slary
zhangsan,20,1000
lisi,30,2000.34
jim,25,1500
wang,34,3000.0
kate,32,9000.0
liming,25,1200.0


读取文本
import pandas

# Load the CSV file into a DataFrame.
test = pandas.read_csv("test.csv")

# Show the Python type of the loaded object.
print(type(test))

print("---------------------------------------------")
# Show the dtype pandas assigned to each column.
print(test.dtypes)

# Uncomment the two lines below to print the help text of read_csv.
#print("---------------------------------------------")
#print(help(pandas.read_csv))
运行结果
<class 'pandas.core.frame.DataFrame'>
---------------------------------------------
name      object
age        int64
slary    float64
dtype: object

显示读取的头部数据

# test.head() shows only the leading rows of the loaded data rather than the
# whole table; called without an argument it shows the first five rows.
# Pass a row count to change that, e.g. 3 for the first three rows.
test.head(2)

运行结果

显示读取的尾部数据
# Show the last two rows.
test.tail(2)

运行结果


显示每一列的指标

# Show the column labels (the names taken from the CSV header line).
print(test.columns)

# Show the shape as (rows, columns): 6 samples with 3 values each.
print(test.shape)
运行结果
Index(['name', 'age', 'slary'], dtype='object')
(6, 3)    # 6 个样本,每个样本3个指标

读取数据

# Select a single row: the one whose index label is 0.
print(test.loc[0])
运行结果
name     zhangsan
age            20
slary        1000
Name: 0, dtype: object

读取数据片

# Select the rows labelled 3, 4 and 5 (labels start at 0; a .loc slice
# includes both endpoints).
print(test.loc[3:5])
运行结果
     name  age   slary
3    wang   34  3000.0
4    kate   32  9000.0
5  liming   25  1200.0

读取任意数据

# Select rows 0, 2 and 4.
# Approach 1: keep the row labels in a list and pass that list to .loc.
test_index = [0,2,4]
print(test.loc[test_index])
运行结果
       name  age   slary
0  zhangsan   20  1000.0
2       jim   25  1500.0
4      kate   32  9000.0

# Select rows 0, 2 and 4.
# Approach 2: pass the list of labels to .loc inline.
print(test.loc[[0,2,4]])
运行结果
       name  age   slary
0  zhangsan   20  1000.0
2       jim   25  1500.0
4      kate   32  9000.0

取一列数据

# Select one column by its name.
print(test["name"])
运行结果
0    zhangsan
1        lisi
2         jim
3        wang
4        kate
5      liming
Name: name, dtype: object

取多列数据

# Select two columns at once by passing a list of column names.
col = ["name","age"]
print(test[col])
运行结果
       name  age
0  zhangsan   20
1      lisi   30
2       jim   25
3      wang   34
4      kate   32
5    liming   25


---------------------------------------------------------------------------------------------------------------------
test.csv
NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Iron_(mg)
1001,BUTTER WITH SALT,15.87,717,0.02
1002,BUTTER WHIPPED WITH SALT,15.87,717,0.16
1003,BUTTER OIL ANHYDROUS,0.24,876,0
1004,CHEESE BLUE,42.41,353,0.31

1005,CHEESE BRICK,41.11,371,0.43


import pandas
# Load the CSV file and list its column names.
food_info = pandas.read_csv("test.csv")
col_names = food_info.columns.tolist()
print(col_names)

print("---------------------------------------------------------------------------")
# Keep only the columns measured in grams, i.e. those whose name ends with "(g)".
gram_columns = [c for c in col_names if c.endswith("(g)")]
print(food_info[gram_columns].columns)
运行结果
['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Iron_(mg)']
---------------------------------------------------------------------------
Index(['Water_(g)'], dtype='object')
# Unit conversion: turn the values of the mg column into grams.
print(food_info["Iron_(mg)"])
print("--------------------------------------------------------------------------")
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)


print("--------------------------------------------------------------------------")
# Add the converted values as a new column; this only changes the in-memory
# DataFrame, nothing is written back to the file.
food_info["Iron_(g)"] = div_1000
food_info.head()
运行结果
0    0.02
1    0.16
2    0.00
3    0.31
4    0.43
Name: Iron_(mg), dtype: float64
--------------------------------------------------------------------------
0    0.00002
1    0.00016
2    0.00000
3    0.00031
4    0.00043
Name: Iron_(mg), dtype: float64
--------------------------------------------------------------------------

# Combine two columns by multiplying them element by element.
water_energy = food_info["Water_(g)"]*food_info["Energ_Kcal"]
print(water_energy)
运行结果
0    11378.79
1    11378.79
2      210.24
3    14970.73
4    15251.81
dtype: float64

归一化操作

# Find the largest value in the column.
max_calories = food_info["Energ_Kcal"].max()
print(max_calories)

print("--------------------------------------------------------------------------")
# Normalize the column by dividing every value by that maximum.
normalized_calories = food_info["Energ_Kcal"] / max_calories
print(normalized_calories)
运行结果
876
--------------------------------------------------------------------------
0    0.818493
1    0.818493
2    1.000000
3    0.402968
4    0.423516
Name: Energ_Kcal, dtype: float64
排序操作
# Sort the rows by "Iron_(g)" in ascending order; inplace=True reorders
# food_info itself.
food_info.sort_values("Iron_(g)", inplace=True)
print(food_info["Iron_(g)"])

print("--------------------------------------------------------------------------")
# Sort again, this time in descending order.
food_info.sort_values("Iron_(g)", inplace=True, ascending=False)
print(food_info["Iron_(g)"])
运行结果
2    0.00000
0    0.00002
1    0.00016
3    0.00031
4    0.00043
Name: Iron_(g), dtype: float64
--------------------------------------------------------------------------
4    0.00043
3    0.00031
1    0.00016
0    0.00002
2    0.00000
Name: Iron_(g), dtype: float64


----------------------------------------------------------------------------------------------------------------------------

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S
12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S
13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S
14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S


import pandas as pd
import numpy as np
# Load the CSV file into a DataFrame and preview its first rows.
titanic_survival = pd.read_csv("test.csv")
titanic_survival.head()
运行结果


# Look at the "Age" column.
age = titanic_survival["Age"]
print(age.loc[0:10])

print("-------------------------------------")
# Flag missing values: False means the value is present, True means it is missing.
age_is_null = pd.isnull(age)
print(age_is_null)

print("-------------------------------------")
# Keep only the entries that are missing.
age_null_true = age[age_is_null]
print(age_null_true)

print("-------------------------------------")
# Count the missing values.
age_null_count = len(age_null_true)
print(age_null_count)

运行结果

0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64
-------------------------------------
0     False
1     False
2     False
3     False
4     False
5      True
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
Name: Age, dtype: bool
-------------------------------------
5   NaN
Name: Age, dtype: float64
-------------------------------------
1
 
 
# Handling missing values when averaging the ages.
# Naive average over the whole column: the missing entry turns the built-in
# sum(), and therefore the result, into NaN.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

print("-------------------------------------")
# Drop the missing entries first and average only the ages that are present.
good_ages = titanic_survival["Age"][~age_is_null]
corrent_mean_age = sum(good_ages) / len(good_ages)
print(corrent_mean_age)

print("-------------------------------------")
# pandas offers this directly: Series.mean() skips missing values.
corrent_mean_age = titanic_survival["Age"].mean()
print(corrent_mean_age)

运行结果

 
 
nan
-------------------------------------
27.714285714285715
-------------------------------------
27.714285714285715
# Average fare paid in each passenger class, keyed by class number.
passenger_classes = [1,2,3]
fares_by_class = {
    pclass: titanic_survival.loc[titanic_survival["Pclass"] == pclass, "Fare"].mean()
    for pclass in passenger_classes
}
print(fares_by_class)

print("------------------------------------------------------")
# Survival rate for first, second and third class.
# index   - the column whose distinct values form the groups (here Pclass)
# values  - the column summarised within each group
# aggfunc - how the values of one group are combined; the mean of the 0/1
#           "Survived" flag is the share of passengers who survived
# The aggregation is named as a string because recent pandas versions warn
# when they are handed the NumPy callable np.mean instead.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
{1: 50.698949999999996, 2: 30.0708, 3: 12.77708}
------------------------------------------------------
        Survived
Pclass          
1           0.75
2           1.00
3           0.30

# Average age within each passenger class; with no aggfunc given,
# pivot_table aggregates with the mean.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_survival)
          Age
Pclass       
1       46.25
2       14.00
3       21.00
# Relate one column to two others: total fare and number of survivors for
# each value of "Embarked". The aggregation is named as a string because
# recent pandas versions warn when handed the NumPy callable np.sum.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc="sum")
print(port_stats)
              Fare  Survived
Embarked                    
C         101.3541         2
Q           8.4583         0
S         250.8250         5
# Drop missing values.
# axis=1 drops every column that contains a missing value.
drop_na_colums = titanic_survival.dropna(axis = 1)
# axis=0 with subset drops the rows in which "Age" or "Sex" is missing.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
print(new_titanic_survival)
   PassengerId  Survived  Pclass  \
0             1         0       3   
1             2         1       1   
2             3         1       3   
3             4         1       1   
4             5         0       3   
6             7         0       1   
7             8         0       3   
8             9         1       3   
9            10         1       2   
10           11         1       3   
11           12         1       1   
12           13         0       3   
13           14         0       3   
14           15         0       3   

                                                 Name     Sex   Age  SibSp  \
0                             Braund, Mr. Owen Harris    male  22.0      1   
1   Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                              Heikkinen, Miss. Laina  female  26.0      0   
3        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                            Allen, Mr. William Henry    male  35.0      0   
6                             McCarthy, Mr. Timothy J    male  54.0      0   
7                      Palsson, Master. Gosta Leonard    male   2.0      3   
8   Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
9                 Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
10                    Sandstrom, Miss. Marguerite Rut  female   4.0      1   
11                           Bonnell, Miss. Elizabeth  female  58.0      0   
12                     Saundercock, Mr. William Henry    male  20.0      0   
13                        Andersson, Mr. Anders Johan    male  39.0      1   
14               Vestrom, Miss. Hulda Amanda Adolfina  female  14.0      0   

    Parch            Ticket     Fare Cabin Embarked  
0       0         A/5 21171   7.2500   NaN        S  
1       0          PC 17599  71.2833   C85        C  
2       0  STON/O2. 3101282   7.9250   NaN        S  
3       0            113803  53.1000  C123        S  
4       0            373450   8.0500   NaN        S  
6       0             17463  51.8625   E46        S  
7       1            349909  21.0750   NaN        S  
8       2            347742  11.1333   NaN        S  
9       0            237736  30.0708   NaN        C  
10      1           PP 9549  16.7000    G6        S  
11      0            113783  26.5500  C103        S  
12      0         A/5. 2151   8.0500   NaN        S  
13      5            347082  31.2750   NaN        S  
14      0            350406   7.8542   NaN        S  
# Look up single cells by row label and column name.
row_index_4_age = titanic_survival.loc[4,"Age"]
print(row_index_4_age)
# Row label 10, matching the variable name.
row_index_10_pclass = titanic_survival.loc[10,"Pclass"]
print(row_index_10_pclass)
35.0
3
# Sort by age in descending order.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival)

# Renumber the index of the sorted rows from 0; drop=True discards the old labels.
print("-----------------------------------------------------------------------------")
titanic_reindex = new_titanic_survival.reset_index(drop=True)
print(titanic_reindex)
    PassengerId  Survived  Pclass  \
11           12         1       1   
6             7         0       1   
13           14         0       3   
1             2         1       1   
3             4         1       1   
4             5         0       3   
8             9         1       3   
2             3         1       3   
0             1         0       3   
12           13         0       3   
9            10         1       2   
14           15         0       3   
10           11         1       3   
7             8         0       3   
5             6         0       3   

                                                 Name     Sex   Age  SibSp  \
11                           Bonnell, Miss. Elizabeth  female  58.0      0   
6                             McCarthy, Mr. Timothy J    male  54.0      0   
13                        Andersson, Mr. Anders Johan    male  39.0      1   
1   Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
3        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                            Allen, Mr. William Henry    male  35.0      0   
8   Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
2                              Heikkinen, Miss. Laina  female  26.0      0   
0                             Braund, Mr. Owen Harris    male  22.0      1   
12                     Saundercock, Mr. William Henry    male  20.0      0   
9                 Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
14               Vestrom, Miss. Hulda Amanda Adolfina  female  14.0      0   
10                    Sandstrom, Miss. Marguerite Rut  female   4.0      1   
7                      Palsson, Master. Gosta Leonard    male   2.0      3   
5                                    Moran, Mr. James    male   NaN      0   

    Parch            Ticket     Fare Cabin Embarked  
11      0            113783  26.5500  C103        S  
6       0             17463  51.8625   E46        S  
13      5            347082  31.2750   NaN        S  
1       0          PC 17599  71.2833   C85        C  
3       0            113803  53.1000  C123        S  
4       0            373450   8.0500   NaN        S  
8       2            347742  11.1333   NaN        S  
2       0  STON/O2. 3101282   7.9250   NaN        S  
0       0         A/5 21171   7.2500   NaN        S  
12      0         A/5. 2151   8.0500   NaN        S  
9       0            237736  30.0708   NaN        C  
14      0            350406   7.8542   NaN        S  
10      1           PP 9549  16.7000    G6        S  
7       1            349909  21.0750   NaN        S  
5       0            330877   8.4583   NaN        Q  
-----------------------------------------------------------------------------
    PassengerId  Survived  Pclass  \
0            12         1       1   
1             7         0       1   
2            14         0       3   
3             2         1       1   
4             4         1       1   
5             5         0       3   
6             9         1       3   
7             3         1       3   
8             1         0       3   
9            13         0       3   
10           10         1       2   
11           15         0       3   
12           11         1       3   
13            8         0       3   
14            6         0       3   

                                                 Name     Sex   Age  SibSp  \
0                            Bonnell, Miss. Elizabeth  female  58.0      0   
1                             McCarthy, Mr. Timothy J    male  54.0      0   
2                         Andersson, Mr. Anders Johan    male  39.0      1   
3   Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
4        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
5                            Allen, Mr. William Henry    male  35.0      0   
6   Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
7                              Heikkinen, Miss. Laina  female  26.0      0   
8                             Braund, Mr. Owen Harris    male  22.0      1   
9                      Saundercock, Mr. William Henry    male  20.0      0   
10                Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
11               Vestrom, Miss. Hulda Amanda Adolfina  female  14.0      0   
12                    Sandstrom, Miss. Marguerite Rut  female   4.0      1   
13                     Palsson, Master. Gosta Leonard    male   2.0      3   
14                                   Moran, Mr. James    male   NaN      0   

    Parch            Ticket     Fare Cabin Embarked  
0       0            113783  26.5500  C103        S  
1       0             17463  51.8625   E46        S  
2       5            347082  31.2750   NaN        S  
3       0          PC 17599  71.2833   C85        C  
4       0            113803  53.1000  C123        S  
5       0            373450   8.0500   NaN        S  
6       2            347742  11.1333   NaN        S  
7       0  STON/O2. 3101282   7.9250   NaN        S  
8       0         A/5 21171   7.2500   NaN        S  
9       0         A/5. 2151   8.0500   NaN        S  
10      0            237736  30.0708   NaN        C  
11      0            350406   7.8542   NaN        S  
12      1           PP 9549  16.7000    G6        S  
13      1            349909  21.0750   NaN        S  
14      0            330877   8.4583   NaN        Q 


函数操作

# 函数,返回第10行数据
def ten_row(column):
    """Return the entry of ``column`` stored under index label 9.

    With a default 0-based index that is the tenth entry. ``column`` must
    support label lookup through ``.loc``; a missing label 9 raises
    ``KeyError``.
    """
    return column.loc[9]
    
# Apply the function to every column; the result gets its own name so that
# the ten_row function is not overwritten by the returned Series.
tenth_row = titanic_survival.apply(ten_row)
print(tenth_row)
PassengerId                                     10
Survived                                         1
Pclass                                           2
Name           Nasser, Mrs. Nicholas (Adele Achem)
Sex                                         female
Age                                             14
SibSp                                            1
Parch                                            0
Ticket                                      237736
Fare                                       30.0708
Cabin                                          NaN
Embarked                                         C
dtype: object
# 每一列缺失值的个数是多少
def not_null_count(column):
    """Return how many entries of ``column`` are missing.

    Note that, despite its name, this counts the null entries (those flagged
    by ``pd.isnull``), not the non-null ones.
    """
    return len(column[pd.isnull(column)])

# Run the counter on every column of the table.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             1
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin          10
Embarked        0
dtype: int64
# 对数据进行一次转换
def which_class(row):
    """Translate the ``Pclass`` value of ``row`` into a readable label.

    Returns "Unknown" when the value is missing, "First Class",
    "Second Class" or "Third Class" for 1, 2 and 3, and ``None`` for any
    other value.
    """
    ticket_class = row['Pclass']
    if pd.isnull(ticket_class):
        return "Unknown"
    labels = ((1, "First Class"), (2, "Second Class"), (3, "Third Class"))
    for number, label in labels:
        if ticket_class == number:
            return label
    return None
    
# axis=1 hands each row of the table to the function.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
0      Third Class
1      First Class
2      Third Class
3      First Class
4      Third Class
5      Third Class
6      First Class
7      Third Class
8      Third Class
9     Second Class
10     Third Class
11     First Class
12     Third Class
13     Third Class
14     Third Class
dtype: object
# 对数据进行转换
def is_minor(row):
    """Return True when the ``Age`` value of ``row`` is below 18, else False.

    A NaN age compares as not below 18, so it yields False.
    """
    return bool(row["Age"] < 18)
    
# Evaluate every row of the table (axis=1).
minors = titanic_survival.apply(is_minor, axis=1)

def generate_age_label(row):
    """Classify the ``Age`` value of ``row``.

    Returns "unknown" when the age is missing, "minor" when it is below 18
    and "adult" otherwise.
    """
    years = row["Age"]
    if pd.isnull(years):
        return "unknown"
    return "minor" if years < 18 else "adult"
    
# Label every row of the table (axis=1).
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
0       adult
1       adult
2       adult
3       adult
4       adult
5     unknown
6       adult
7       minor
8       adult
9       minor
10      minor
11      adult
12      adult
13      adult
14      minor
dtype: object
# Attach the labels as a new column, then compare survival across the groups.
titanic_survival['age_labels']=age_labels
# With no aggfunc given, pivot_table reports the mean of "Survived" per label.
age_group_survial = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survial)
            Survived
age_labels          
adult            0.5
minor            0.5
unknown          0.0





 
 



















猜你喜欢

转载自blog.csdn.net/youshijian99/article/details/80607733