代码:
import pandas as pd

# Merging two DataFrames on one or more key columns.
# Simple example: both frames share a single column named 'key'.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# Join the rows of `left` and `right` whose 'key' values match.
res = pd.merge(left, right, on='key')
print(res)
运行结果:
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K2
3  A3  B3  K3
    C   D key
0  C0  D0  K0
1  C1  D1  K1
2  C2  D2  K2
3  C3  D3  K3
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3
代码:
# Two key columns: rows are identified by the pair (key1, key2).
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
# Note: the pair (K1, K0) appears twice in `right`.
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
运行结果:
    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0
代码:
# The `how` argument selects the join type; the values demonstrated here are
# 'left', 'right', 'outer' and 'inner'.
# Uses the two-key `left` / `right` frames built in the previous snippet.

# inner: keep only (key1, key2) pairs that occur in both frames.
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print("Inner")
print(res)

# outer: keep every pair from either frame; the missing side is filled with NaN.
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print("Outer")
print(res)

# left: keep every row of `left`; unmatched columns from `right` become NaN.
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print("Left")
print(res)

# right: keep every row of `right`; unmatched columns from `left` become NaN.
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print("Right")
print(res)
运行结果:
Inner
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
Outer
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3
Left
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
Right
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3
代码:
# indicator: add a column recording where each merged row came from
# ('left_only', 'right_only' or 'both').
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 3], 'col_right': [2, 2, 2]})
print(df1)
print(df2)

# indicator=True names the provenance column '_merge'.
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)

# Passing a string instead uses that string as the column name.
res = pd.merge(df1, df2, on='col1', how='outer', indicator="AAA")
print(res)
运行结果:
   col1 col_left
0     0        a
1     1        b
   col1  col_right
0     1          2
1     2          2
2     3          2
   col1 col_left  col_right      _merge
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     3      NaN        2.0  right_only
   col1 col_left  col_right         AAA
0     0        a        NaN   left_only
1     1        b        2.0        both
2     2      NaN        2.0  right_only
3     3      NaN        2.0  right_only
代码:
# Merging on the index instead of on a column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']},
    index=['K0', 'K2', 'K3'],
)
print(left)
print(right)

# left_index / right_index make the row labels the join keys.
# outer keeps the union of the labels, inner keeps their intersection.
res1 = pd.merge(left, right, left_index=True, right_index=True, how='outer')
res2 = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res1)
print(res2)
运行结果:
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C1  D1
K3  C2  D2
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C1   D1
K3  NaN  NaN   C2   D2
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C1  D1
代码:
# Handling overlapping column names: both frames have a non-key column
# called 'age', so the merged result needs a suffix on each side to tell
# them apart.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K2'], 'age': [4, 5, 6]})
print(boys)
print(girls)

# suffixes=(left_suffix, right_suffix) renames the clashing 'age' columns
# to 'age_boys' and 'age_girls'.
res = pd.merge(boys, girls, on='k', suffixes=('_boys', '_girls'), how='inner')
print(res)
运行结果:
   age   k
0    1  K0
1    2  K1
2    3  K2
   age   k
0    4  K0
1    5  K0
2    6  K2
   age_boys   k  age_girls
0         1  K0          4
1         1  K0          5
2         3  K2          6