Pandas基本用法

基于 Jupyter Notebook
资料来源

数据表的创建

Series

import pandas as pd

# A Series built from a plain list gets the default positional index 0..3.
s = pd.Series(data=[27.2, 27.65, 27.70, 28])
# s.values
# s.index

# Label the same four prices with consecutive calendar days instead.
dates = pd.date_range(start='20191001', periods=4)
print(dates)
s2 = pd.Series(data=[27.2, 27.65, 27.70, 28], index=dates, name="海底捞股价")
s2

DatetimeIndex(['2019-10-01', '2019-10-02', '2019-10-03', '2019-10-04'], dtype='datetime64[ns]', freq='D')

2019-10-01    27.20
2019-10-02    27.65
2019-10-03    27.70
2019-10-04    28.00
Freq: D, Name: 海底捞股价, dtype: float64

import pandas as pd
import numpy as np

# Four prices plus one missing value, to contrast len()/shape with count().
s = pd.Series(np.array([27.2, 27.65, 27.70, 28, np.nan]))

# len() and shape include the NaN entry; count() tallies non-missing values only.
print('The length is {}'.format(len(s)))
print('The shape is {}'.format(s.shape))
print('The count is {}'.format(s.count()))
# unique() reports NaN as one of the distinct values.
print(s.unique())

print("-" * 7)
# value_counts() leaves NaN out of the tally by default.
print(s.value_counts())

The length is 5
The shape is (5,)
The count is 4
[27.2  27.65 27.7  28.     nan]
-------
27.70    1
27.65    1
27.20    1
28.00    1
dtype: int64

# Dict keys become the index labels and dict values become the data.
data_dict = dict(BABA=187.07, PDD=21.83, JD=30.79, BIDU=184.77)
s3 = pd.Series(data_dict)
s3.name = "中概股"
s3.index.name = "股票代号"
s3

股票代号
BABA    187.07
PDD      21.83
JD       30.79
BIDU    184.77
Name: 中概股, dtype: float64

# Adding two Series aligns them on index labels first: a label found in only
# one operand (FB, BIDU) or carrying NaN on either side (JD) yields NaN.
stock = ['FB', 'BABA', 'PDD', 'JD']
datas = [160.0, 28.0, 25.0, np.nan]
s4 = pd.Series(data=datas, index=stock)
s4.add(s3)

BABA    215.07
BIDU       NaN
FB         NaN
JD         NaN
PDD      46.83
dtype: float64

DataFrame

# Build from a list of lists or a 2-D array; with no labels given, rows and
# columns both get the default 0..n-1 labels.
# df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
df1 = pd.DataFrame(np.arange(1, 7).reshape(2, 3))
df1

	0	1	2
0	1	2	3
1	4	5	6

symbol = ['BABA', 'JD', 'AAPL', 'MS', 'GS', 'WMT']
# One dict entry per column; each list holds one value per ticker in `symbol`.
data = {
    '行业': ['电商', '电商', '科技', '金融', '金融', '零售'],
    '价格': [176.92, 25.95, 172.97, 41.79, 196.00, 99.55],
    '交易量': [16175610, 27113291, 18913154, 10132145, 2626634, 8086946],
    '雇员': [101550, 175336, 100000, 60348, 36600, 2200000],
}
# The tickers become the row labels, carried under the index name "代号".
df2 = pd.DataFrame(data, index=pd.Index(symbol, name="代号"))
# This attaches a plain attribute to the object; it does not appear in the
# rendered table.
df2.name = "美股"
df2

	行业	价格	交易量	雇员
代号
BABA	电商	176.92	16175610	101550
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000
MS	金融	41.79	10132145	60348
GS	金融	196.00	2626634	36600
WMT	零售	99.55	8086946	2200000

df2.values

array([['电商', 176.92, 16175610, 101550],
       ['电商', 25.95, 27113291, 175336],
       ['科技', 172.97, 18913154, 100000],
       ['金融', 41.79, 10132145, 60348],
       ['金融', 196.0, 2626634, 36600],
       ['零售', 99.55, 8086946, 2200000]], dtype=object)

df2.columns

Index(['行业', '价格', '交易量', '雇员'], dtype='object')

df2.index

Index(['BABA', 'JD', 'AAPL', 'MS', 'GS', 'WMT'], dtype='object', name='代号')

查看DataFrame

df2.head()  # 默认5行

	行业	价格	交易量	雇员
代号
BABA	电商	176.92	16175610	101550
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000
MS	金融	41.79	10132145	60348
GS	金融	196.00	2626634	36600

df2.tail(3)

	行业	价格	交易量	雇员
代号
MS	金融	41.79	10132145	60348
GS	金融	196.00	2626634	36600
WMT	零售	99.55	8086946	2200000

统计DataFrame

df2.describe()
# describe() 默认只统计「数值型」列；传入 include='all' 可把其他类型的列也包含进来

	价格	交易量	雇员
count	6.000000	6.000000e+00	6.000000e+00
mean	118.863333	1.384130e+07	4.456390e+05
std	73.748714	8.717312e+06	8.607522e+05
min	25.950000	2.626634e+06	3.660000e+04
25%	56.230000	8.598246e+06	7.026100e+04
50%	136.260000	1.315388e+07	1.007750e+05
75%	175.932500	1.822877e+07	1.568895e+05
max	196.000000	2.711329e+07	2.200000e+06

升维DataFrame

# Swap the flat ticker index for a two-level (region, ticker) index, given as
# one list per level instead of one tuple per row.
df2.index = pd.MultiIndex.from_arrays([
    ['中国公司', '中国公司', '美国公司', '美国公司', '美国公司', '美国公司'],
    ['BABA', 'JD', 'AAPL', 'MS', 'GS', 'WMT'],
])
df2

		行业	价格	交易量	雇员
中国公司	BABA	电商	176.92	16175610	101550
中国公司	JD	电商	25.95	27113291	175336
美国公司	AAPL	科技	172.97	18913154	100000
	MS	金融	41.79	10132145	60348
	GS	金融	196.00	2626634	36600
	WMT	零售	99.55	8086946	2200000

三维 Panel

Panel 自 pandas 0.20 起已被弃用，并在 0.25 版本中被正式移除；三维数据建议改用带 MultiIndex 的 DataFrame 或 xarray。以下代码只能在旧版本的 pandas 中运行。

# Shared row index: four consecutive days starting 2019-04-01.
dates = pd.date_range('20190401',periods=4)

# Open/close prices for the first stock (stored under the key '海底捞' below).
data = {'开盘价': [27.2, 27.65, 27.70, 28],
        '收盘价': [27.1, 27.55, 27.45, 28.1]}
df1 = pd.DataFrame( data, index=dates )

# Open/close prices for the second stock (stored under the key '腾讯' below).
data = {'开盘价': [367, 369.8, 378.2, 380.6],
        '收盘价': [369.5, 370.1, 380, 382.1]}
df2 = pd.DataFrame( data, index=dates )

# Combine both frames into a 3-D Panel: items = stock, major axis = date,
# minor axis = price field.
# NOTE(review): the recorded run already raises a FutureWarning that Panel is
# deprecated; newer pandas releases no longer provide pd.Panel, so this cell
# only runs on old versions. Prefer a MultiIndex DataFrame or xarray.
p_data = {'海底捞' : df1, '腾讯' : df2}
pn = pd.Panel(p_data)
pn

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2961: FutureWarning: 
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)





<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 2 (minor_axis)
Items axis: 海底捞 to 腾讯
Major_axis axis: 2019-04-01 00:00:00 to 2019-04-04 00:00:00
Minor_axis axis: 开盘价 to 收盘价

数据表的存载

# to_excel()
# to_csv()
# to_sql()
# to_hdf()

# read_excel()
# read_csv()
# read_sql()
# read_hdf()

Excel格式

# Round-trip a small frame through an Excel workbook.
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
# to_excel() writes the row index as the first worksheet column by default.
df.to_excel('pd_excel.xlsx', sheet_name='Sheet1')

# index_col=0 turns that first column back into the index; without it the
# saved index comes back as an extra data column and the frame no longer
# matches the one that was written.
df1 = pd.read_excel('pd_excel.xlsx', sheet_name='Sheet1', index_col=0)
df1

	0	1	2
0	1	2	3
1	4	5	6

csv格式

# df.to_csv('文件名', index=False)
# 注意：如果 index 没有特意设定，最好不要把 index 值存到 csv 文件中

# Quotes from several markets; one dict entry per column.
data = {
    'Code': ['BABA', '00700.HK', 'AAPL', '600519.SH'],
    'Name': ['阿里巴巴', '腾讯', '苹果', '茅台'],
    'Market': ['US', 'HK', 'US', 'SH'],
    'Price': [185.35, 380.2, 197, 900.2],
    'Currency': ['USD', 'HKD', 'USD', 'CNY'],
}

df = pd.DataFrame.from_dict(data)
# index=False keeps the default 0..n-1 row labels out of the file.
df.to_csv(path_or_buf='pd_csv.csv', index=False)

# Only the five data columns were saved, so reading back assigns a fresh
# default index.
df2 = pd.read_csv(filepath_or_buffer='pd_csv.csv')
df2

	Code	Name	Market	Price	Currency
0	BABA	阿里巴巴	US	185.35	USD
1	00700.HK	腾讯	HK	380.20	HKD
2	AAPL	苹果	US	197.00	USD
3	600519.SH	茅台	SH	900.20	CNY

数据表的索引和切片

# Sample frame used throughout the indexing and slicing section: one row per
# ticker, one column per attribute.
symbol = ['BABA', 'JD', 'AAPL', 'MS', 'GS', 'WMT']
data = {'行业': ['电商', '电商', '科技', '金融', '金融', '零售'],
        '价格': [176.92, 25.95, 172.97, 41.79, 196.00, 99.55],
        '交易量': [16175610, 27113291, 18913154, 10132145, 2626634, 8086946],
        '雇员': [101550, 175336, 100000, 60348, 36600, 2200000]}
df = pd.DataFrame(data, index=symbol)
df.name = "美股"
# Fixed: this used to assign `df.indexname`, which only attaches a stray
# attribute to the frame and leaves the index unnamed. The outputs recorded
# in this section were captured before the fix, so they show no index name.
df.index.name = "代号"
df

	行业	价格	交易量	雇员
BABA	电商	176.92	16175610	101550
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000
MS	金融	41.79	10132145	60348
GS	金融	196.00	2626634	36600
WMT	零售	99.55	8086946	2200000

索引单元素

df.at['AAPL', '价格']

172.97

df.iat[2, 1]

172.97

切片 columns

df.价格

BABA    176.92
JD       25.95
AAPL    172.97
MS       41.79
GS      196.00
WMT      99.55
Name: 价格, dtype: float64

df['价格']

BABA    176.92
JD       25.95
AAPL    172.97
MS       41.79
GS      196.00
WMT      99.55
Name: 价格, dtype: float64

df.loc[:, '交易量']

BABA    16175610
JD      27113291
AAPL    18913154
MS      10132145
GS       2626634
WMT      8086946
Name: 交易量, dtype: int64

df.iloc[:, 0]

BABA    电商
JD      电商
AAPL    科技
MS      金融
GS      金融
WMT     零售
Name: 行业, dtype: object

df[['雇员', '价格']]

	雇员	价格
BABA	101550	176.92
JD	175336	25.95
AAPL	100000	172.97
MS	60348	41.79
GS	36600	196.00
WMT	2200000	99.55

df.loc[:, '行业':'交易量']

	行业	价格	交易量
BABA	电商	176.92	16175610
JD	电商	25.95	27113291
AAPL	科技	172.97	18913154
MS	金融	41.79	10132145
GS	金融	196.00	2626634
WMT	零售	99.55	8086946

df.iloc[:, 0:2]

	行业	价格
BABA	电商	176.92
JD	电商	25.95
AAPL	科技	172.97
MS	金融	41.79
GS	金融	196.00
WMT	零售	99.55

切片 index

df

	行业	价格	交易量	雇员
BABA	电商	176.92	16175610	101550
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000
MS	金融	41.79	10132145	60348
GS	金融	196.00	2626634	36600
WMT	零售	99.55	8086946	2200000

df.loc['GS', :]

行业          金融
价格         196
交易量    2626634
雇员       36600
Name: GS, dtype: object

df.iloc[3, :]

行业           金融
价格        41.79
交易量    10132145
雇员        60348
Name: MS, dtype: object

df[1:2]

	行业	价格	交易量	雇员
JD	电商	25.95	27113291	175336

df['JD': 'JD']

	行业	价格	交易量	雇员
JD	电商	25.95	27113291	175336

df[1:4]

	行业	价格	交易量	雇员
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000
MS	金融	41.79	10132145	60348

df['GS': 'WMT']

	行业	价格	交易量	雇员
GS	金融	196.00	2626634	36600
WMT	零售	99.55	8086946	2200000

df.loc['MS': 'GS', :]

	行业	价格	交易量	雇员
MS	金融	41.79	10132145	60348
GS	金融	196.00	2626634	36600

df.iloc[1:3, :]

	行业	价格	交易量	雇员
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000

切片 index 和 columns

df.loc['GS': 'WMT', '价格']

GS     196.00
WMT     99.55
Name: 价格, dtype: float64

df.iloc[:2, 1:3]

	价格	交易量
BABA	176.92	16175610
JD	25.95	27113291

高级索引

print(df.雇员 >= 100000)
df.loc[df.雇员 >= 100000, :]

BABA     True
JD       True
AAPL     True
MS      False
GS      False
WMT      True
Name: 雇员, dtype: bool

	行业	价格	交易量	雇员
BABA	电商	176.92	16175610	101550
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000
WMT	零售	99.55	8086946	2200000

print(df.dtypes == 'int64')
df.loc[:, df.dtypes == 'int64']

行业     False
价格     False
交易量     True
雇员      True
dtype: bool

	交易量	雇员
BABA	16175610	101550
JD	27113291	175336
AAPL	18913154	100000
MS	10132145	60348
GS	2626634	36600
WMT	8086946	2200000

df.loc[lambda x: x.交易量 > x.交易量.mean(), :]

	行业	价格	交易量	雇员
BABA	电商	176.92	16175610	101550
JD	电商	25.95	27113291	175336
AAPL	科技	172.97	18913154	100000

df.loc[lambda x: (x.交易量 > x.交易量.mean()) & (x.价格 > 100), :]

	行业	价格	交易量	雇员
BABA	电商	176.92	16175610	101550
AAPL	科技	172.97	18913154	100000

df.价格.loc[lambda x: x>100]

BABA    176.92
AAPL    172.97
GS      196.00
Name: 价格, dtype: float64

# Nine observations, each identified by a (date, ticker) pair.
price = [190, 32, 196, 192, 200, 189, 31, 30, 199]
dates = (['2019-04-01'] * 3
         + ['2019-04-02'] * 2
         + ['2019-04-03'] * 2
         + ['2019-04-04'] * 2)
codes = ['BABA', 'JD', 'GS', 'BABA', 'GS', 'BABA', 'JD', 'JD', 'GS']

# Two parallel label arrays produce a two-level MultiIndex.
data = pd.Series(price, index=pd.MultiIndex.from_arrays([dates, codes]))
data

2019-04-01  BABA    190
            JD       32
            GS      196
2019-04-02  BABA    192
            GS      200
2019-04-03  BABA    189
            JD       31
2019-04-04  JD       30
            GS      199
dtype: int64

data.index

MultiIndex(levels=[['2019-04-01', '2019-04-02', '2019-04-03', '2019-04-04'], ['BABA', 'GS', 'JD']],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 2, 1, 0, 1, 0, 2, 2, 1]])

# Flat table with a default integer index; region and ticker are ordinary
# columns here.
data = {
    '地区': ['中国', '中国', '美国', '美国'],
    '代号': ['BABA', 'JD', 'MS', 'GS'],
    '行业': ['电商', '电商', '金融', '金融'],
    '价格': [176.92, 25.95, 41.79, 196.00],
    '交易量': [16175610, 27113291, 10132145, 2626634],
    '雇员': [101550, 175336, 60348, 36600],
}
df = pd.DataFrame.from_dict(data)
df

	地区	代号	行业	价格	交易量	雇员
0	中国	BABA	电商	176.92	16175610	101550
1	中国	JD	电商	25.95	27113291	175336
2	美国	MS	金融	41.79	10132145	60348
3	美国	GS	金融	196.00	2626634	36600

df2 = df.set_index(['地区', '代号'])
df2

		行业	价格	交易量	雇员
地区	代号
中国	BABA	电商	176.92	16175610	101550
中国	JD	电商	25.95	27113291	175336
美国	MS	金融	41.79	10132145	60348
美国	GS	金融	196.00	2626634	36600

df2.reset_index()

	地区	代号	行业	价格	交易量	雇员
0	中国	BABA	电商	176.92	16175610	101550
1	中国	JD	电商	25.95	27113291	175336
2	美国	MS	金融	41.79	10132145	60348
3	美国	GS	金融	196.00	2626634	36600

数据表的合并和连接

合并

# Adjusted close for four consecutive days, 2019-01-01 through 2019-01-04.
df_price = pd.DataFrame({
    'Date': pd.date_range(start='2019-01-01', end='2019-01-04'),
    'Adj Close': [24.42, 25.00, 25.25, 25.64],
})
df_price

	Date	Adj Close
0	2019-01-01	24.42
1	2019-01-02	25.00
2	2019-01-03	25.25
3	2019-01-04	25.64

# Volume for five consecutive days, 2019-01-02 through 2019-01-06; the range
# overlaps the price table on three dates only.
df_volume = pd.DataFrame({
    'Date': pd.date_range(start='2019-01-02', end='2019-01-06'),
    'Volume': [56081400, 99455500, 83028700, 100234000, 73829000],
})
df_volume

	Date	Volume
0	2019-01-02	56081400
1	2019-01-03	99455500
2	2019-01-04	83028700
3	2019-01-05	100234000
4	2019-01-06	73829000

pd.merge(df_price, df_volume, how='left')

	Date	Adj Close	Volume
0	2019-01-01	24.42	NaN
1	2019-01-02	25.00	56081400.0
2	2019-01-03	25.25	99455500.0
3	2019-01-04	25.64	83028700.0

pd.merge(df_price, df_volume, how='right')

	Date	Adj Close	Volume
0	2019-01-02	25.00	56081400
1	2019-01-03	25.25	99455500
2	2019-01-04	25.64	83028700
3	2019-01-05	NaN	100234000
4	2019-01-06	NaN	73829000

pd.merge(df_price, df_volume, how='outer')

	Date	Adj Close	Volume
0	2019-01-01	24.42	NaN
1	2019-01-02	25.00	56081400.0
2	2019-01-03	25.25	99455500.0
3	2019-01-04	25.64	83028700.0
4	2019-01-05	NaN	100234000.0
5	2019-01-06	NaN	73829000.0

pd.merge(df_price, df_volume, how='inner')

	Date	Adj Close	Volume
0	2019-01-02	25.00	56081400
1	2019-01-03	25.25	99455500
2	2019-01-04	25.64	83028700

多键合并

# First portfolio, written one row per position: (asset, instrument, number).
porfolio1 = pd.DataFrame(
    [('FX', 'Option', 1),
     ('FX', 'Swap', 2),
     ('IR', 'Option', 3)],
    columns=['Asset', 'Instrument', 'Number'],
)
porfolio1

	Asset	Instrument	Number
0	FX	Option	1
1	FX	Swap	2
2	IR	Option	3

# Second portfolio; (FX, Option) appears twice, so a merge on both keys
# produces two rows for that pair.
porfolio2 = pd.DataFrame(
    [('FX', 'Option', 4),
     ('FX', 'Option', 5),
     ('FX', 'Swap', 6),
     ('IR', 'Swap', 7)],
    columns=['Asset', 'Instrument', 'Number'],
)
porfolio2

	Asset	Instrument	Number
0	FX	Option	4
1	FX	Option	5
2	FX	Swap	6
3	IR	Swap	7

pd.merge(porfolio1, porfolio2, on=['Asset', 'Instrument'], how='outer')

	Asset	Instrument	Number_x	Number_y
0	FX	Option	1.0	4.0
1	FX	Option	1.0	5.0
2	FX	Swap	2.0	6.0
3	IR	Option	3.0	NaN
4	IR	Swap	NaN	7.0

pd.merge(porfolio1, porfolio2, on='Asset')

	Asset	Instrument_x	Number_x	Instrument_y	Number_y
0	FX	Option	1	Option	4
1	FX	Option	1	Option	5
2	FX	Option	1	Swap	6
3	FX	Swap	2	Option	4
4	FX	Swap	2	Option	5
5	FX	Swap	2	Swap	6
6	IR	Option	3	Swap	7

连接

# Three Series with pairwise-disjoint labels.
s1 = pd.Series(data=[0, 1], index=list('ab'))
s2 = pd.Series(data=[2, 3, 4], index=list('cde'))
s3 = pd.Series(data=[5, 6], index=list('fg'))

# The default axis=0 appends them end to end into one longer Series.
pd.concat(objs=[s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

pd.concat([s1, s2, s3], axis=1, sort=False)

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

# Fixed: `s4` was still the stock-price Series from the Series section, whose
# labels share nothing with s1, so the inner join came back empty. Rebuild s4
# from s1 and s3 (labels a, b, f, g) so the join keeps the shared rows a and b.
s4 = pd.concat([s1, s3])
pd.concat([s1, s4], axis=1, join='inner')

	0	1

pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

连接DataFrame

# 3x4 frame holding 0..11 in row-major order, columns a-d.
df1 = pd.DataFrame(data=np.arange(12).reshape(3, 4), columns=list('abcd'))
df1

	a	b	c	d
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11

# 2x3 frame holding 0..5; its columns are a subset of a-d in a different
# order, with no column c.
df2 = pd.DataFrame(data=np.arange(6).reshape(2, 3), columns=list('bda'))
df2

	b	d	a
0	0	1	2
1	3	4	5

pd.concat([df1, df2])

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

	a	b	c	d
0	0	1	2.0	3
1	4	5	6.0	7
2	8	9	10.0	11
0	2	0	NaN	1
1	5	3	NaN	4

pd.concat([df1, df2], ignore_index=True)

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.

	a	b	c	d
0	0	1	2.0	3
1	4	5	6.0	7
2	8	9	10.0	11
3	2	0	NaN	1
4	5	3	NaN	4

数据表的重塑和透视

重塑

# Small two-row frame for the stack/unstack demo; both axes get a name so the
# reshaped results show which level is which.
symbol = ['JD', 'AAPL']
data = {
    '行业': ['电商', '科技'],
    '价格': [25.95, 172.97],
    '交易量': [27113291, 18913154],
}
df = pd.DataFrame(data, index=pd.Index(symbol, name='代号'))
df.columns.name = '特征'
df

特征	行业	价格	交易量
代号
JD	电商	25.95	27113291
AAPL	科技	172.97	18913154

df.stack()  # 列索引 → 行索引

代号    特征 
JD    行业           电商
      价格        25.95
      交易量    27113291
AAPL  行业           科技
      价格       172.97
      交易量    18913154
dtype: object

df.unstack()

特征   代号  
行业   JD            电商
     AAPL          科技
价格   JD         25.95
     AAPL      172.97
交易量  JD      27113291
     AAPL    18913154
dtype: object

df.describe()

特征	价格	交易量
count	2.000000	2.000000e+00
mean	99.460000	2.301322e+07
std	103.958839	5.798372e+06
min	25.950000	1.891315e+07
25%	62.705000	2.096319e+07
50%	99.460000	2.301322e+07
75%	136.215000	2.506326e+07
max	172.970000	2.711329e+07

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, JD to AAPL
Data columns (total 3 columns):
行业     2 non-null object
价格     2 non-null float64
交易量    2 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 144.0+ bytes

完~

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

数据表的创建

Series

DataFrame

查看DataFrame

统计DataFrame

升维DataFrame

三维 Panel

数据表的存载

Excel格式

csv格式

数据表的索引和切片

索引单元素

切片 columns

切片 index

切片 index 和 columns

高级索引

数据表的合并和连接

合并

多键合并

连接

连接DataFrame

数据表的重塑和透视

重塑

猜你喜欢

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0