pandas 表合并

import pandas as pd

示例1:依据一组key合并

# 定义资料集并打印出
left1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
right1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
left1
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K2
3 A3 B3 K3
right1
C D key
0 C0 D0 K0
1 C1 D1 K1
2 C2 D2 K2
3 C3 D3 K3
pd.merge(left1, right1, on='key')
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3

示例2:依据两组key合并

**合并时 how 参数常用的取值有 4 种:'left'、'right'、'outer'、'inner',预设值为 how='inner'**

# 定义资料集并打印出
left2 = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
right2 = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
left2
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
right2
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
pd.merge(left2, right2, on=['key1', 'key2'], how='inner')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
pd.merge(left2, right2, on=['key1', 'key2'], how='outer')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
pd.merge(left2, right2, on=['key1', 'key2'], how='left')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
pd.merge(left2, right2, on=['key1', 'key2'], how='right')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3

对比:on 与 left_index / right_index

# 旧版 pandas 中,on 与 left_index=True, right_index=True 同时使用时,实际是按 index 对齐合并,key1、key2 列取 left 的值(见下方输出);新版 pandas 不允许这种组合,会抛出 MergeError
pd.merge(left2, right2, on=['key1', 'key2'], left_index=True, right_index=True,  how='right')
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 C1 D1
2 A2 B2 K1 K0 C2 D2
3 A3 B3 K2 K1 C3 D3
pd.merge(left2, right2, left_index=True, right_index=True,  how='right')
A B key1_x key2_x C D key1_y key2_y
0 A0 B0 K0 K0 C0 D0 K0 K0
1 A1 B1 K0 K1 C1 D1 K1 K0
2 A2 B2 K1 K0 C2 D2 K1 K0
3 A3 B3 K2 K1 C3 D3 K2 K0
right2.index
RangeIndex(start=0, stop=4, step=1)
left2.index
RangeIndex(start=0, stop=4, step=1)

示例3:Indicator

**indicator=True 会在结果中新增一列 `_merge`,标明每行记录的来源:left_only(仅左表)、right_only(仅右表)或 both(两表都有)**

# 定义资料集并打印出
left3 = pd.DataFrame({
    'col1': [0, 1],
    'col_left': ['a', 'b']
})
right3 = pd.DataFrame({
    'col1': [1, 2, 2],
    'col_right': [2, 2, 2]
})
left3
col1 col_left
0 0 a
1 1 b
right3
col1 col_right
0 1 2
1 2 2
2 2 2
# 依据col1进行合并,并启用indicator=True,最后打印出
pd.merge(left3, right3, on='col1', how='outer', indicator=True)
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
# 自定indicator column的名称,并打印出
pd.merge(left3, right3, on='col1', how='outer', indicator='indicator_column')
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only

示例4:依据index合并

# 定义资料集并打印出
left4 = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2'],
        'B': ['B0', 'B1', 'B2']
    },
    index=['K0', 'K1', 'K2']
)
right4 = pd.DataFrame(
    {
        'C': ['C0', 'C2', 'C3'],
        'D': ['D0', 'D2', 'D3']
    },
    index=['K0', 'K2', 'K3']
)
left4
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
right4
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
left4.index
Index(['K0', 'K1', 'K2'], dtype='object')
right4.index
Index(['K0', 'K2', 'K3'], dtype='object')
#依据左右资料集的index进行合并,how='outer',并打印出
pd.merge(left4, right4, left_index=True, right_index=True, how='outer')
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
#依据左右资料集的index进行合并,how='inner',并打印出
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
#依据左右资料集的index进行合并,how='inner',并打印出
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2

示例5:解决overlapping的问题

#定义资料集
left5 = pd.DataFrame({
    'k': ['K0', 'K1', 'K2'], 
    'age': [1, 2, 3]
})
right5 = pd.DataFrame({
    'k': ['K0', 'K0', 'K3'], 
    'age': [4, 5, 6]
})
left5
age k
0 1 K0
1 2 K1
2 3 K2
right5
age k
0 4 K0
1 5 K0
2 6 K3
# 两张表都有 age 列(列名冲突)时,可以用 suffixes 参数为左右两侧的同名列分别加上后缀
pd.merge(left5, right5, on='k',  suffixes=['_left5', '_right5'], how='inner')
age_left5 k age_right5
0 1 K0 4
1 1 K0 5
left5.columns
Index(['age', 'k'], dtype='object')
left5.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
age    3 non-null int64
k      3 non-null object
dtypes: int64(1), object(1)
memory usage: 128.0+ bytes
left5.describe().T
count mean std min 25% 50% 75% max
age 3.0 2.0 1.0 1.0 1.5 2.0 2.5 3.0

参考链接

Pandas 合并 merge

PANDAS 数据合并与重塑(join/merge篇)

猜你喜欢

转载自blog.csdn.net/q370835062/article/details/83780636