Python-pandas详解

一、pandas

pandas是基于NumPy的一种工具，该工具是为了解决数据分析任务而创建的。Pandas纳入了大量库和一些标准的数据模型，提供了高效地操作大型数据集所需的工具。pandas提供了大量能使我们快速便捷地处理数据的函数和方法。

pandas主要基于两种数据类型：Series与DataFrame。

Series是一个一维的数据类型，其中每一个元素都有一个标签。如果学过NumPy，就会发现Series类似于NumPy中的一维数组，只是每个元素都带有标签。其中，标签可以是数字或者字符串。
DataFrame是一个二维的表结构。pandas的DataFrame可以存储许多种不同的数据类型，并且每一个坐标轴（行和列）都有自己的标签。

下面就分别对两种数据类型进行学习与总结。

二、Series

创建series对象

import pandas as pd
s = pd.Series([1, 2, "hello", "hzc"])
print(s) # 结果左边是索引，右边是对应的值
# 0        1
# 1        2
# 2    hello
# 3      hzc
# dtype: object

生成一个指定索引的series

import pandas as pd
s = pd.Series(range(4), index=["a", "b", "c", "d"]) # 索引是a,b,c,d
print(s)
# a    0
# b    1
# c    2
# d    3
# dtype: int64

通过字典来创建series对象

import pandas as pd
dic = {"d": "hzc", "c": "lmn", "b": 123, "a": 100}
s = pd.Series(dic)
print(s)
# d    hzc
# c    lmn
# b    123
# a    100
# dtype: object

用字典生成Series时可以指定索引；如果索引中的某个标签在字典中没有对应的键，则该标签对应的值标记为NaN（缺失值）。可以通过pandas.isnull和pandas.notnull来判断哪些索引对应的值缺失。

s = pd.Series(dic, index=["a", "b", "x"])
print(s)
# a    100
# b    123
# x    NaN
# dtype: object

print(pd.isnull(s))
# a    False
# b    False
# x     True
# dtype: bool

print(pd.notnull(s))
# a     True
# b     True
# x    False
# dtype: bool

访问series中的元素和索引

import pandas as pd
dic = {"d": "hzc", "c": "lmn", "b": 123, "a": 100}
s = pd.Series(dic)
print(s["a"])
# 100

print(s[["a", "b"]])
# a    100
# b    123

print(s.index)
# Index(['d', 'c', 'b', 'a'], dtype='object')

print(s.values)
# ['hzc' 'lmn' 123 100]

简单运算

s = s * 2
print(s)
# d    hzchzc
# c    lmnlmn
# b       246
# a       200
# dtype: object

三、DataFrame

DataFrame是一种表格型的数据结构，包含一组有序的列，每列可以是不同的值类型。DataFrame有行索引和列索引，可以看成由Series组成的字典。

输出6行4列的表格

import pandas as pd
import numpy as np
dates = pd.date_range('20180926', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])  # 生成6行4列的随机数表格，行索引为日期，列名为A、B、C、D
print(df) # 输出表格
#                    A         B         C         D
# 2018-09-26  0.787128  0.719132 -0.980213  0.188578
# 2018-09-27 -0.111568  0.018125 -2.418211  1.782390
# 2018-09-28  0.835537  2.340993 -0.191740  1.238414
# 2018-09-29 -0.502952  0.845483  0.078102  0.137999
# 2018-09-30  0.092992  0.812140  1.844608  0.622599
# 2018-10-01 -0.218961 -1.290632  1.370229  0.091662

print(df['B']) # 输出B列
# 2018-09-26    0.719132
# 2018-09-27    0.018125
# 2018-09-28    2.340993
# 2018-09-29    0.845483
# 2018-09-30    0.812140
# 2018-10-01   -1.290632

创建特定数据的DataFrame

import pandas as pd
dic = {"A": 1, "B": pd.Timestamp('20180926'), "C": "hzc", "D": pd.Series(np.random.randn(4))}
df = pd.DataFrame(dic)
print(df)
#    A          B    C         D
# 0  1 2018-09-26  hzc -0.487341
# 1  1 2018-09-26  hzc  1.410734
# 2  1 2018-09-26  hzc -1.971544
# 3  1 2018-09-26  hzc  0.897068

print(df.index)  # 行索引（行标签）
# RangeIndex(start=0, stop=4, step=1)

print(df.columns)  # 列索引（列名）
# Index(['A', 'B', 'C', 'D'], dtype='object')

print(df.values)  # 每个值
# [[1 Timestamp('2018-09-26 00:00:00') 'hzc' -0.487341]
#  [1 Timestamp('2018-09-26 00:00:00') 'hzc' 1.410734]
#  [1 Timestamp('2018-09-26 00:00:00') 'hzc' -1.971544]
#  [1 Timestamp('2018-09-26 00:00:00') 'hzc' 0.897068]]

按值进行排序

print(df.sort_values(by='D'))  #按照D列进行排序
#    A          B    C         D
# 2  1 2018-09-26  hzc -1.971544
# 0  1 2018-09-26  hzc -0.487341
# 3  1 2018-09-26  hzc  0.897068
# 1  1 2018-09-26  hzc  1.410734

切片选择

print(df[0:3])  # 显示前三行的数据

根据标签选择数据：loc（按行标签和列标签）

print(df.loc[0, ['C', 'D']]) # 根据行标签和列标签精确定位（此处行标签恰好是整数0）

根据位置选择数据：iloc（按行号和列号，均从0开始计数）

print(df.iloc[3, 3])  # 输出行号为3、列号为3的数据（从0开始计数，即第四行第四列）
print(df.iloc[0:3,0:2])  # 进行切片选择，第0-2行的第0-1列
print(df.iloc[[0,2,3],[0,2]])  # 进行不连续筛选（当前df只有4行，行号必须在0-3之间，否则会报IndexError）

根据loc和iloc设置数据

dates = pd.date_range('20180926', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
#              A   B   C   D
# 2018-09-26   0   1   2   3
# 2018-09-27   4   5   6   7
# 2018-09-28   8   9  10  11
# 2018-09-29  12  13  14  15
# 2018-09-30  16  17  18  19
# 2018-10-01  20  21  22  23

df.iloc[1, 2] = 100
print(df)
#              A   B    C   D
# 2018-09-26   0   1    2   3
# 2018-09-27   4   5  100   7
# 2018-09-28   8   9   10  11
# 2018-09-29  12  13   14  15
# 2018-09-30  16  17   18  19
# 2018-10-01  20  21   22  23

df.loc["2018-09-30", "C"] = 666
print(df)
#              A   B    C   D
# 2018-09-26   0   1    2   3
# 2018-09-27   4   5  100   7
# 2018-09-28   8   9   10  11
# 2018-09-29  12  13   14  15
# 2018-09-30  16  17  666  19
# 2018-10-01  20  21   22  23

pandas导入数据

file = pd.read_csv("C:\\Users\\hzcforever\\Desktop\\mydata.csv", header=0)
print(file.head(5))  # 打印csv表格数据的前五行
#        type gender  vage   age  ageg  region     ee  num     cost
#    0      B      F     1    18     1     BJ  26.890   20  33973.4
#    1      B      F     1    18     1     SH   3.089    2   2167.5
#    2      B      F     1    18     1     SZ   3.299    2   2082.5
#    3      B      F     1    18     1     CQ   1.000    0      0.0
#    4      B      F     1    19     1     BJ  31.493   23  36339.2
file.columns = ["type", "sex", "num", "age", "ageg", "location", "pride", "number", "sum"]  # 对表格列名重新命名
print(len(file))  # 输出表格数据的总行数
print(file[file["sum"] > 30000].head(5))  # 筛选出需要的数据的前五行

合并数据

df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])

res = pd.concat([df1, df2, df3], axis=1, ignore_index=True)  # axis为1表示横向合并，为0表示竖向合并；ignore_index置为True表示丢弃合并方向上的原有标签并重新编号（此处为列标签0-11）
print(res)
#     0    1    2    3    4    5    6    7    8    9    10   11
# 0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0

merge合并

import pandas as pd

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
print(left)
#   key   A   B
# 0  K0  A0  B0
# 1  K1  A1  B1
# 2  K2  A2  B2
# 3  K3  A3  B3
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2',  'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(right)
#   key   C   D
# 0  K0  C0  D0
# 1  K1  C1  D1
# 2  K2  C2  D2
# 3  K3  C3  D3
res = pd.merge(left, right, on="key")
print(res)
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2
# 3  K3  A3  B3  C3  D3

# 如果依据两列进行合并，又可细分为内联合并（how='inner'）、外联合并（how='outer'）、左联合并（how='left'）和右联合并（how='right'），如果熟悉数据库相关知识会更容易理解，这里不继续展开

依据index合并

import pandas as pd

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=['K0', 'K1', 'K2'])
print(left)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']}, index=['K0', 'K2', 'K3'])
print(right)
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

res = pd.merge(left, right, left_index=True, right_index=True, how='outer')  # 外联合并
print(res)
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

res = pd.merge(left, right, left_index=True, right_index=True, how='inner')  # 内联合并
print(res)
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2

一、pandas

二、Series

三、DataFrame

猜你喜欢