import pandas as pd
示例1:依据一组key合并
left1 = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right1 = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
left1
|
A |
B |
key |
0 |
A0 |
B0 |
K0 |
1 |
A1 |
B1 |
K1 |
2 |
A2 |
B2 |
K2 |
3 |
A3 |
B3 |
K3 |
right1
|
C |
D |
key |
0 |
C0 |
D0 |
K0 |
1 |
C1 |
D1 |
K1 |
2 |
C2 |
D2 |
K2 |
3 |
C3 |
D3 |
K3 |
pd.merge(left1, right1, on='key')
|
A |
B |
key |
C |
D |
0 |
A0 |
B0 |
K0 |
C0 |
D0 |
1 |
A1 |
B1 |
K1 |
C1 |
D1 |
2 |
A2 |
B2 |
K2 |
C2 |
D2 |
3 |
A3 |
B3 |
K3 |
C3 |
D3 |
示例2:依据两组key合并
** 合并时有4种方法how = [‘left’, ‘right’, ‘outer’, ‘inner’],预设值how=‘inner’ **
left2 = pd.DataFrame({
'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right2 = pd.DataFrame({
'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
left2
|
A |
B |
key1 |
key2 |
0 |
A0 |
B0 |
K0 |
K0 |
1 |
A1 |
B1 |
K0 |
K1 |
2 |
A2 |
B2 |
K1 |
K0 |
3 |
A3 |
B3 |
K2 |
K1 |
right2
|
C |
D |
key1 |
key2 |
0 |
C0 |
D0 |
K0 |
K0 |
1 |
C1 |
D1 |
K1 |
K0 |
2 |
C2 |
D2 |
K1 |
K0 |
3 |
C3 |
D3 |
K2 |
K0 |
pd.merge(left2, right2, on=['key1', 'key2'], how='inner')
|
A |
B |
key1 |
key2 |
C |
D |
0 |
A0 |
B0 |
K0 |
K0 |
C0 |
D0 |
1 |
A2 |
B2 |
K1 |
K0 |
C1 |
D1 |
2 |
A2 |
B2 |
K1 |
K0 |
C2 |
D2 |
pd.merge(left2, right2, on=['key1', 'key2'], how='outer')
|
A |
B |
key1 |
key2 |
C |
D |
0 |
A0 |
B0 |
K0 |
K0 |
C0 |
D0 |
1 |
A1 |
B1 |
K0 |
K1 |
NaN |
NaN |
2 |
A2 |
B2 |
K1 |
K0 |
C1 |
D1 |
3 |
A2 |
B2 |
K1 |
K0 |
C2 |
D2 |
4 |
A3 |
B3 |
K2 |
K1 |
NaN |
NaN |
5 |
NaN |
NaN |
K2 |
K0 |
C3 |
D3 |
pd.merge(left2, right2, on=['key1', 'key2'], how='left')
|
A |
B |
key1 |
key2 |
C |
D |
0 |
A0 |
B0 |
K0 |
K0 |
C0 |
D0 |
1 |
A1 |
B1 |
K0 |
K1 |
NaN |
NaN |
2 |
A2 |
B2 |
K1 |
K0 |
C1 |
D1 |
3 |
A2 |
B2 |
K1 |
K0 |
C2 |
D2 |
4 |
A3 |
B3 |
K2 |
K1 |
NaN |
NaN |
pd.merge(left2, right2, on=['key1', 'key2'], how='right')
|
A |
B |
key1 |
key2 |
C |
D |
0 |
A0 |
B0 |
K0 |
K0 |
C0 |
D0 |
1 |
A2 |
B2 |
K1 |
K0 |
C1 |
D1 |
2 |
A2 |
B2 |
K1 |
K0 |
C2 |
D2 |
3 |
NaN |
NaN |
K2 |
K0 |
C3 |
D3 |
分割线 与left_index 对比
pd.merge(left2, right2, on=['key1', 'key2'], left_index=True, right_index=True, how='right')
|
A |
B |
key1 |
key2 |
C |
D |
0 |
A0 |
B0 |
K0 |
K0 |
C0 |
D0 |
1 |
A1 |
B1 |
K0 |
K1 |
C1 |
D1 |
2 |
A2 |
B2 |
K1 |
K0 |
C2 |
D2 |
3 |
A3 |
B3 |
K2 |
K1 |
C3 |
D3 |
pd.merge(left2, right2, left_index=True, right_index=True, how='right')
|
A |
B |
key1_x |
key2_x |
C |
D |
key1_y |
key2_y |
0 |
A0 |
B0 |
K0 |
K0 |
C0 |
D0 |
K0 |
K0 |
1 |
A1 |
B1 |
K0 |
K1 |
C1 |
D1 |
K1 |
K0 |
2 |
A2 |
B2 |
K1 |
K0 |
C2 |
D2 |
K1 |
K0 |
3 |
A3 |
B3 |
K2 |
K1 |
C3 |
D3 |
K2 |
K0 |
right2.index
RangeIndex(start=0, stop=4, step=1)
left2.index
RangeIndex(start=0, stop=4, step=1)
示例3:Indicator
** indicator=True会将合并的记录放在新的一列 **
left3 = pd.DataFrame({
'col1': [0, 1],
'col_left': ['a', 'b']
})
right3 = pd.DataFrame({
'col1': [1, 2, 2],
'col_right': [2, 2, 2]
})
left3
|
col1 |
col_left |
0 |
0 |
a |
1 |
1 |
b |
right3
|
col1 |
col_right |
0 |
1 |
2 |
1 |
2 |
2 |
2 |
2 |
2 |
pd.merge(left3, right3, on='col1', how='outer', indicator=True)
|
col1 |
col_left |
col_right |
_merge |
0 |
0 |
a |
NaN |
left_only |
1 |
1 |
b |
2.0 |
both |
2 |
2 |
NaN |
2.0 |
right_only |
3 |
2 |
NaN |
2.0 |
right_only |
pd.merge(left3, right3, on='col1', how='outer', indicator='indicator_column')
|
col1 |
col_left |
col_right |
indicator_column |
0 |
0 |
a |
NaN |
left_only |
1 |
1 |
b |
2.0 |
both |
2 |
2 |
NaN |
2.0 |
right_only |
3 |
2 |
NaN |
2.0 |
right_only |
示例4:依据index合并
left4 = pd.DataFrame(
{
'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']
},
index=['K0', 'K1', 'K2']
)
right4 = pd.DataFrame(
{
'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']
},
index=['K0', 'K2', 'K3']
)
left4
|
A |
B |
K0 |
A0 |
B0 |
K1 |
A1 |
B1 |
K2 |
A2 |
B2 |
right4
|
C |
D |
K0 |
C0 |
D0 |
K2 |
C2 |
D2 |
K3 |
C3 |
D3 |
left4.index
Index(['K0', 'K1', 'K2'], dtype='object')
right4.index
Index(['K0', 'K2', 'K3'], dtype='object')
pd.merge(left4, right4, left_index=True, right_index=True, how='outer')
|
A |
B |
C |
D |
K0 |
A0 |
B0 |
C0 |
D0 |
K1 |
A1 |
B1 |
NaN |
NaN |
K2 |
A2 |
B2 |
C2 |
D2 |
K3 |
NaN |
NaN |
C3 |
D3 |
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
|
A |
B |
C |
D |
K0 |
A0 |
B0 |
C0 |
D0 |
K2 |
A2 |
B2 |
C2 |
D2 |
pd.merge(left4, right4, left_index=True, right_index=True, how='inner')
|
A |
B |
C |
D |
K0 |
A0 |
B0 |
C0 |
D0 |
K2 |
A2 |
B2 |
C2 |
D2 |
示例5:解决overlapping的问题
left5 = pd.DataFrame({
'k': ['K0', 'K1', 'K2'],
'age': [1, 2, 3]
})
right5 = pd.DataFrame({
'k': ['K0', 'K0', 'K3'],
'age': [4, 5, 6]
})
left5
|
age |
k |
0 |
1 |
K0 |
1 |
2 |
K1 |
2 |
3 |
K2 |
right5
|
age |
k |
0 |
4 |
K0 |
1 |
5 |
K0 |
2 |
6 |
K3 |
pd.merge(left5, right5, on='k', suffixes=['_left5', '_right5'], how='inner')
|
age_left5 |
k |
age_right5 |
0 |
1 |
K0 |
4 |
1 |
1 |
K0 |
5 |
left5.columns
Index(['age', 'k'], dtype='object')
left5.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
age 3 non-null int64
k 3 non-null object
dtypes: int64(1), object(1)
memory usage: 128.0+ bytes
left5.describe().T
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
age |
3.0 |
2.0 |
1.0 |
1.0 |
1.5 |
2.0 |
2.5 |
3.0 |
参考链接
Pandas 合并 merge
PANDAS 数据合并与重塑(join/merge篇)