pandas hierarchical indexing, combining and merging, and reshaping data

Self-study notes on pandas: hierarchical indexing, combining and merging datasets, and reshaping/pivoting.
1. Hierarchical indexing
2. Combining and merging datasets
3. Reshaping and pivoting

1 Hierarchical Indexing

import pandas as pd
import numpy as np

Generate a Series with a two-level (hierarchical) index

data = pd.Series(np.random.randn(9),index=[['a','a','a','b','b','c','c','d','d'],[1,2,3,1,3,1,2,2,3]])
data
a  1    0.269045
   2   -0.885298
   3    0.158806
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
d  2   -0.088124
   3   -1.746533
dtype: float64
data.index
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

Partial indexing with the outer level

data['b']
1    0.378847
3    0.301012
dtype: float64
data['b':'c']
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
dtype: float64
data.loc[['b','c']]
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
dtype: float64

Selecting on an inner level

data.loc[:, 2]
a   -0.885298
c   -1.453824
d   -0.088124
dtype: float64
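
An equivalent way to select on the inner level (a small sketch using the same data Series as above) is a cross-section with xs:

data.xs(2, level=1)  # same result as data.loc[:, 2]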

Use unstack to rearrange the data

data.unstack()
1 2 3
a 0.269045 -0.885298 0.158806
b 0.378847 NaN 0.301012
c -1.255018 -1.453824 NaN
d NaN -0.088124 -1.746533

The inverse operation of unstack is stack

data.unstack().stack()
a  1    0.269045
   2   -0.885298
   3    0.158806
b  1    0.378847
   3    0.301012
c  1   -1.255018
   2   -1.453824
d  2   -0.088124
   3   -1.746533
dtype: float64

Either axis of a DataFrame can have a hierarchical index

frame = pd.DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Gerrn']])
frame
     Ohio     Colorado
    Green Red    Gerrn
a 1     0   1        2
  2     3   4        5
b 1     6   7        8
  2     9  10       11

Add names for the index levels

frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
state      Ohio     Colorado
color     Green Red    Gerrn
key1 key2
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11

Partial column indexing selects groups of columns

frame['Ohio']
color     Green Red
key1 key2
a    1        0   1
     2        3   4
b    1        6   7
     2        9  10
from pandas import MultiIndex

Use MultiIndex to create columns with hierarchies

MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Gerrn']],names=['state','color'])
MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Gerrn')],
           names=['state', 'color'])
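
The MultiIndex can be kept and reused when constructing a DataFrame; a minimal sketch that reproduces the frame above:

columns = MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Gerrn']],names=['state','color'])
pd.DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=columns)  # same layout as frame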

1.1 Reordering and sorting levels

frame.swaplevel('key1','key2')
state      Ohio     Colorado
color     Green Red    Gerrn
key2 key1
1    a        0   1        2
2    a        3   4        5
1    b        6   7        8
2    b        9  10       11
frame.sort_index(level=1)
state      Ohio     Colorado
color     Green Red    Gerrn
key1 key2
a    1        0   1        2
b    1        6   7        8
a    2        3   4        5
b    2        9  10       11
frame.swaplevel(0,1).sort_index(level=0)
state      Ohio     Colorado
color     Green Red    Gerrn
key2 key1
1    a        0   1        2
     b        6   7        8
2    a        3   4        5
     b        9  10       11

1.2 Summary statistics by level

frame.sum(level='key2')
state  Ohio     Colorado
color Green Red    Gerrn
key2
1         6   8       10
2        12  14       16
frame.sum(level='color',axis=1)
color     Green Red Gerrn
key1 key2
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11
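
Side note: newer pandas releases deprecate (and pandas 2.x removes) the level argument to aggregations such as sum. A rough groupby-based equivalent, assuming the same frame:

frame.groupby(level='key2').sum()        # same as frame.sum(level='key2')
frame.T.groupby(level='color').sum().T   # same values as frame.sum(level='color',axis=1); groupby sorts labels, so column order may differ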

1.3 Using columns for indexing

frame = pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
frame
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
frame2 = frame.set_index(['c','d'])
frame2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
frame.set_index(['c','d'],drop=False)
       a  b    c  d
c   d
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
frame2.reset_index()
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1

2 Combining and Merging Datasets

2.1 Database-style DataFrame joins

df1 = pd.DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df1
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
df2 = pd.DataFrame({'key':['a','b','d'],'data2':range(3)})
df2
key data2
0 a 0
1 b 1
2 d 2
pd.merge(df1,df2)
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
pd.merge(df1,df2,on='key')
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
df3 = pd.DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)})
df3
lkey data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
df4 = pd.DataFrame({'rkey':['a','b','d'],'data2':range(3)})
df4
rkey data2
0 a 0
1 b 1
2 d 2
pd.merge(df3,df4,left_on='lkey',right_on='rkey')
lkey data1 rkey data2
0 b 0 b 1
1 b 1 b 1
2 b 6 b 1
3 a 2 a 0
4 a 4 a 0
5 a 5 a 0
pd.merge(df1,df2,how='outer')
key data1 data2
0 b 0.0 1.0
1 b 1.0 1.0
2 b 6.0 1.0
3 a 2.0 0.0
4 a 4.0 0.0
5 a 5.0 0.0
6 c 3.0 NaN
7 d NaN 2.0
df1 = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df1
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 b 5
df2 = pd.DataFrame({'key':['a','b','a','b','d'],'data2':range(5)})
df2
key data2
0 a 0
1 b 1
2 a 2
3 b 3
4 d 4
pd.merge(df1,df2,on='key',how='left')
key data1 data2
0 b 0 1.0
1 b 0 3.0
2 b 1 1.0
3 b 1 3.0
4 a 2 0.0
5 a 2 2.0
6 c 3 NaN
7 a 4 0.0
8 a 4 2.0
9 b 5 1.0
10 b 5 3.0
pd.merge(df1,df2,how='inner')
key data1 data2
0 b 0 1
1 b 0 3
2 b 1 1
3 b 1 3
4 b 5 1
5 b 5 3
6 a 2 0
7 a 2 2
8 a 4 0
9 a 4 2
left = pd.DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
left
key1 key2 lval
0 foo one 1
1 foo two 2
2 bar one 3
right = pd.DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
right
key1 key2 rval
0 foo one 4
1 foo one 5
2 bar one 6
3 bar two 7
pd.merge(left,right,on=['key1','key2'],how='outer')
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
pd.merge(left,right,on='key1')
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
pd.merge(left,right,on='key1',suffixes=('_left','_right'))
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
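
Another merge option worth noting here: indicator=True adds a _merge column recording which side each row came from, which is handy when checking outer joins (a sketch with the frames above):

pd.merge(left, right, on=['key1','key2'], how='outer', indicator=True)  # _merge is 'left_only', 'right_only', or 'both'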

2.2 Merging on index

left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
left1
key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
right1 = pd.DataFrame({'group_val':[3.5,7]},index=['a','b'])
right1
group_val
a 3.5
b 7.0
pd.merge(left1,right1,left_on='key',right_index=True)
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
pd.merge(left1,right1,left_on='key',right_index=True,how='outer')
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN
lefth = pd.DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],'key2':[2000,2001,2002,2001,2002],'data':np.arange(5.)})
lefth
key1 key2 data
0 Ohio 2000 0.0
1 Ohio 2001 1.0
2 Ohio 2002 2.0
3 Nevada 2001 3.0
4 Nevada 2002 4.0
righth = pd.DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],[2001,2000,2000,2000,2001,2002]],columns=['event1','event2'])
righth
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
key1 key2 data event1 event2
0 Ohio 2000 0.0 4 5
0 Ohio 2000 0.0 6 7
1 Ohio 2001 1.0 8 9
2 Ohio 2002 2.0 10 11
3 Nevada 2001 3.0 0 1
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
key1 key2 data event1 event2
0 Ohio 2000 0.0 4.0 5.0
0 Ohio 2000 0.0 6.0 7.0
1 Ohio 2001 1.0 8.0 9.0
2 Ohio 2002 2.0 10.0 11.0
3 Nevada 2001 3.0 0.0 1.0
4 Nevada 2002 4.0 NaN NaN
4 Nevada 2000 NaN 2.0 3.0

Merge using indexes on both sides

left2 = pd.DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['Ohio','Nevada'])
left2
Ohio Nevada
a 1.0 2.0
c 3.0 4.0
e 5.0 6.0
right2 = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
right2
Missouri Alabama
b 7.0 8.0
c 9.0 10.0
d 11.0 12.0
e 13.0 14.0
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
left2.join(right2,how='outer')
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
left1.join(right1,on='key')
key value group_val
0 a 0 3.5
1 b 1 7.0
2 a 2 3.5
3 a 3 3.5
4 b 4 7.0
5 c 5 NaN
another = pd.DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17.]],index=['a','c','e','f'],columns=['New York','Oregon'])
another
New York Oregon
a 7.0 8.0
c 9.0 10.0
e 11.0 12.0
f 16.0 17.0
left2.join([right2,another])
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
left2.join([right2,another],how='outer')
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
b NaN NaN 7.0 8.0 NaN NaN
d NaN NaN 11.0 12.0 NaN NaN
f NaN NaN NaN NaN 16.0 17.0

2.3 Concatenating along an axis

arr = np.arange(12).reshape((3,4))
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
s1 = pd.Series([0,1],index=['a','b'])
s1
a    0
b    1
dtype: int64
s2 = pd.Series([2,3,4],index=['c','d','e'])
s2
c    2
d    3
e    4
dtype: int64
s3 = pd.Series([5,6],index=['f','g'])
s3
f    5
g    6
dtype: int64
pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
pd.concat([s1,s2,s3],axis=1)  # concatenate along the columns
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
s4 = pd.concat([s1,s3])
s4
a    0
b    1
f    5
g    6
dtype: int64
pd.concat([s1,s4],axis=1)
0 1
a 0.0 0
b 1.0 1
f NaN 5
g NaN 6
pd.concat([s1,s4],axis=1,join='inner')
0 1
a 0 0
b 1 1
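
Older pandas versions also accepted a join_axes argument to choose the row labels of the result; it has since been removed, and reindexing the concatenated result is the usual replacement (a sketch with the Series above):

pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])  # keep exactly these row labels; 'c' and 'e' become NaN
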
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
result.unstack()
a b f g
one 0.0 1.0 NaN NaN
two 0.0 1.0 NaN NaN
three NaN NaN 5.0 6.0
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
one two three
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
df1 = pd.DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df1
one two
a 0 1
b 2 3
c 4 5
df2 = pd.DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
df2
three four
a 5 6
c 7 8
pd.concat([df1,df2],axis=1,keys=['level1','level2'])
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
pd.concat({'level1':df1,'level2':df2},axis=1)
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])
upper level1     level2
lower    one two  three four
a          0   1    5.0  6.0
b          2   3    NaN  NaN
c          4   5    7.0  8.0
df1 = pd.DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df1
a b c d
0 0.030306 -0.665932 -0.180251 -0.562281
1 -0.562836 -1.362955 -1.928334 -0.991793
2 -1.382119 0.695608 0.638588 0.812288
df2 = pd.DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df2
b d a
0 0.323228 -1.229234 2.685688
1 -1.968068 -1.996343 -1.134667
pd.concat([df1,df2],ignore_index=True)
a b c d
0 0.030306 -0.665932 -0.180251 -0.562281
1 -0.562836 -1.362955 -1.928334 -0.991793
2 -1.382119 0.695608 0.638588 0.812288
3 2.685688 0.323228 NaN -1.229234
4 -1.134667 -1.968068 NaN -1.996343

2.4 Combining data with overlap

a = pd.Series([np.nan,2.5,0.0,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
a
f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64
b = pd.Series([0.,np.nan,2.,np.nan,np.nan,5.],index=['a','b','c','d','e','f'])
b
a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
b.combine_first(a)
a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64
df1 = pd.DataFrame({'a':[1.,np.nan,5.,np.nan],'b':[np.nan,2.,np.nan,6.],'c':range(2,18,4)})
df1
a b c
0 1.0 NaN 2
1 NaN 2.0 6
2 5.0 NaN 10
3 NaN 6.0 14
df2 = pd.DataFrame({'a':[5.,4.,np.nan,3.,7.],'b':[np.nan,3.,4.,6.,8.]})
df2
a b
0 5.0 NaN
1 4.0 3.0
2 NaN 4.0
3 3.0 6.0
4 7.0 8.0
df1.combine_first(df2)
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN
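
A rough equivalent of combine_first for these two frames (a sketch): align to the union of both indexes and columns, then patch the holes with fillna.

cols = df1.columns.union(df2.columns)
rows = df1.index.union(df2.index)
df1.reindex(index=rows, columns=cols).fillna(df2)  # matches df1.combine_first(df2) here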

3 Reshaping and Pivoting

3.1 Reshaping with a hierarchical index

data = pd.DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
data
number one two three
state
Ohio 0 1 2
Colorado 3 4 5

stack pivots the columns into the rows

result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
result.unstack()
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
result.unstack(0)
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
result.unstack('state')
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
s1 = pd.Series([0,1,2,3],index=['a','b','c','d'])
s1
a    0
b    1
c    2
d    3
dtype: int64
s2 = pd.Series([4,5,6],index=['c','d','e'])
s2
c    4
d    5
e    6
dtype: int64
data2 = pd.concat([s1,s2],keys=['one','two'])
data2
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
data2.unstack()
a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0
data2.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data2.unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
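
unstack also takes fill_value to substitute something other than NaN for the missing combinations (a sketch with the same data2):

data2.unstack(fill_value=0)  # missing label combinations become 0 instead of NaN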
df = pd.DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))
df
side             left  right
state    number
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
df.unstack('state')
side    left          right
state   Ohio Colorado  Ohio Colorado
number
one        0        3     5        8
two        1        4     6        9
three      2        5     7       10
df.unstack('state').stack('side')
state         Colorado  Ohio
number side
one    left          3     0
       right         8     5
two    left          4     1
       right         9     6
three  left          5     2
       right        10     7

3.2 Pivoting "long" to "wide" format

data = pd.read_csv('examples/macrodata.csv')
data.head()
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint
0 1959 1 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00
1 1959 2 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74
2 1959 3 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09
3 1959 4 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06
4 1960 1 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
columns = pd.Index(['realgdp','infl','unemp'],name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D','end')
ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata[:10]
date item value
0 1959-03-31 23:59:59.999999999 realgdp 2710.349
1 1959-03-31 23:59:59.999999999 infl 0.000
2 1959-03-31 23:59:59.999999999 unemp 5.800
3 1959-06-30 23:59:59.999999999 realgdp 2778.801
4 1959-06-30 23:59:59.999999999 infl 2.340
5 1959-06-30 23:59:59.999999999 unemp 5.100
6 1959-09-30 23:59:59.999999999 realgdp 2775.488
7 1959-09-30 23:59:59.999999999 infl 2.740
8 1959-09-30 23:59:59.999999999 unemp 5.300
9 1959-12-31 23:59:59.999999999 realgdp 2785.204
pivoted = ldata.pivot('date','item','value')
pivoted
item infl realgdp unemp
date
1959-03-31 23:59:59.999999999 0.00 2710.349 5.8
1959-06-30 23:59:59.999999999 2.34 2778.801 5.1
1959-09-30 23:59:59.999999999 2.74 2775.488 5.3
1959-12-31 23:59:59.999999999 0.27 2785.204 5.6
1960-03-31 23:59:59.999999999 2.31 2847.699 5.2
... ... ... ...
2008-09-30 23:59:59.999999999 -3.16 13324.600 6.0
2008-12-31 23:59:59.999999999 -8.79 13141.920 6.9
2009-03-31 23:59:59.999999999 0.94 12925.410 8.1
2009-06-30 23:59:59.999999999 3.37 12901.504 9.2
2009-09-30 23:59:59.999999999 3.56 12990.341 9.6

203 rows × 3 columns

ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]
date item value value2
0 1959-03-31 23:59:59.999999999 realgdp 2710.349 1.417751
1 1959-03-31 23:59:59.999999999 infl 0.000 -0.843318
2 1959-03-31 23:59:59.999999999 unemp 5.800 0.752850
3 1959-06-30 23:59:59.999999999 realgdp 2778.801 1.481983
4 1959-06-30 23:59:59.999999999 infl 2.340 0.912075
5 1959-06-30 23:59:59.999999999 unemp 5.100 -0.530813
6 1959-09-30 23:59:59.999999999 realgdp 2775.488 1.443973
7 1959-09-30 23:59:59.999999999 infl 2.740 1.977522
8 1959-09-30 23:59:59.999999999 unemp 5.300 0.621740
9 1959-12-31 23:59:59.999999999 realgdp 2785.204 -0.379998
pivoted = ldata.pivot('date','item')
pivoted[:5]
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-03-31 23:59:59.999999999 0.00 2710.349 5.8 -0.843318 1.417751 0.752850
1959-06-30 23:59:59.999999999 2.34 2778.801 5.1 0.912075 1.481983 -0.530813
1959-09-30 23:59:59.999999999 2.74 2775.488 5.3 1.977522 1.443973 0.621740
1959-12-31 23:59:59.999999999 0.27 2785.204 5.6 1.356540 -0.379998 -0.738881
1960-03-31 23:59:59.999999999 2.31 2847.699 5.2 1.273626 -1.512028 -0.234333
pivoted['value'][:5]
item infl realgdp unemp
date
1959-03-31 23:59:59.999999999 0.00 2710.349 5.8
1959-06-30 23:59:59.999999999 2.34 2778.801 5.1
1959-09-30 23:59:59.999999999 2.74 2775.488 5.3
1959-12-31 23:59:59.999999999 0.27 2785.204 5.6
1960-03-31 23:59:59.999999999 2.31 2847.699 5.2
unstacked = ldata.set_index(['date','item']).unstack('item')
unstacked[:7]
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-03-31 23:59:59.999999999 0.00 2710.349 5.8 -0.843318 1.417751 0.752850
1959-06-30 23:59:59.999999999 2.34 2778.801 5.1 0.912075 1.481983 -0.530813
1959-09-30 23:59:59.999999999 2.74 2775.488 5.3 1.977522 1.443973 0.621740
1959-12-31 23:59:59.999999999 0.27 2785.204 5.6 1.356540 -0.379998 -0.738881
1960-03-31 23:59:59.999999999 2.31 2847.699 5.2 1.273626 -1.512028 -0.234333
1960-06-30 23:59:59.999999999 0.14 2834.390 5.2 1.304159 -0.886630 -0.178589
1960-09-30 23:59:59.999999999 2.70 2839.022 5.6 0.120983 -0.193590 0.747989
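
A note on newer pandas versions: pivot now takes keyword-only arguments, so the calls above would be spelled out like this (same ldata):

ldata.pivot(index='date', columns='item', values='value')  # single value column
ldata.pivot(index='date', columns='item')                  # all remaining columns become values (hierarchical columns)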

3.3 Pivoting "wide" to "long" format

df = pd.DataFrame({'key':['foo','bar','baz'],'A':[1,2,3],'B':[4,5,6],'C':[7,8,9]})
df
key A B C
0 foo 1 4 7
1 bar 2 5 8
2 baz 3 6 9
melted = pd.melt(df,['key'])
melted
key variable value
0 foo A 1
1 bar A 2
2 baz A 3
3 foo B 4
4 bar B 5
5 baz B 6
6 foo C 7
7 bar C 8
8 baz C 9
reshaped = melted.pivot('key','variable','value')
reshaped
variable A B C
key
bar 2 5 8
baz 3 6 9
foo 1 4 7
reshaped.reset_index()
variable key A B C
0 bar 2 5 8
1 baz 3 6 9
2 foo 1 4 7
pd.melt(df,value_vars=['A','B','C'])
variable value
0 A 1
1 A 2
2 A 3
3 B 4
4 B 5
5 B 6
6 C 7
7 C 8
8 C 9
pd.melt(df,value_vars=['key','A','B'])
variable value
0 key foo
1 key bar
2 key baz
3 A 1
4 A 2
5 A 3
6 B 4
7 B 5
8 B 6
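
melt also accepts var_name and value_name to rename the generated columns; a minimal sketch with the same df:

pd.melt(df, id_vars=['key'], value_vars=['A', 'B'], var_name='letter', value_name='number')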

Origin: blog.csdn.net/weixin_43155435/article/details/129265961