Pandas——concat(合并)

1.前言

使用pandas处理多组数据时,往往需要对数据进行合并,concat是一种基本的合并方式。concat有很多参数可以调整,从而合并成你想要的数据形式。

2.垂直合并(axis = 0)

axis=0是预设值,因此未设定任何参数时,函数默认axis=0

import pandas as pd
import numpy as np

# Build three 4x5 frames that share the columns a-e, filled with 6, 7 and 8.
shared_columns = list('abcde')
df1 = pd.DataFrame(np.full((4, 5), 6.0), columns=shared_columns)
df2 = pd.DataFrame(np.full((4, 5), 7.0), columns=shared_columns)
df3 = pd.DataFrame(np.full((4, 5), 8.0), columns=shared_columns)

# Stack the frames vertically (axis=0); each frame keeps its own 0-3 row labels.
res = pd.concat([df1, df2, df3], axis=0)
print(res)

#输出
     a    b    c    d    e
0  6.0  6.0  6.0  6.0  6.0
1  6.0  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0  6.0
0  7.0  7.0  7.0  7.0  7.0
1  7.0  7.0  7.0  7.0  7.0
2  7.0  7.0  7.0  7.0  7.0
3  7.0  7.0  7.0  7.0  7.0
0  8.0  8.0  8.0  8.0  8.0
1  8.0  8.0  8.0  8.0  8.0
2  8.0  8.0  8.0  8.0  8.0
3  8.0  8.0  8.0  8.0  8.0

仔细观察会发现结果的index是0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,若要将index重置,添加参数ignore_index = True:

import pandas as pd
import numpy as np

# Three 4x5 frames with identical columns a-e, filled with 6, 7 and 8.
df1, df2, df3 = (
    pd.DataFrame(np.full((4, 5), fill), columns=list('abcde'))
    for fill in (6.0, 7.0, 8.0)
)

# ignore_index=True discards the original row labels and renumbers the
# stacked result from 0 to 11.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)

#输出
      a    b    c    d    e
0   6.0  6.0  6.0  6.0  6.0
1   6.0  6.0  6.0  6.0  6.0
2   6.0  6.0  6.0  6.0  6.0
3   6.0  6.0  6.0  6.0  6.0
4   7.0  7.0  7.0  7.0  7.0
5   7.0  7.0  7.0  7.0  7.0
6   7.0  7.0  7.0  7.0  7.0
7   7.0  7.0  7.0  7.0  7.0
8   8.0  8.0  8.0  8.0  8.0
9   8.0  8.0  8.0  8.0  8.0
10  8.0  8.0  8.0  8.0  8.0
11  8.0  8.0  8.0  8.0  8.0

3.join(合并方式)

join='outer'为预设值,因此未设定任何参数时,函数默认join='outer'。此方式是依照column来做纵向合并,有相同的column上下合并在一起,其他独有的column各自成列,原本没有值的位置皆以NaN填充。

import pandas as pd
import numpy as np

# df1 has columns a-e and df2 has columns b-f, so they overlap only on b-e.
df1 = pd.DataFrame(np.full((4, 5), 6.0), columns=list('abcde'))
df2 = pd.DataFrame(np.full((4, 5), 7.0), columns=list('bcdef'))

# Vertical "outer" concat of df1 and df2: the result keeps the union of the
# columns, and cells that a frame does not provide are filled with NaN.
res = pd.concat([df1, df2], join='outer')

print(df1)
print('\n')
print(df2)
print(res)

#输出
     a    b    c    d    e
0  6.0  6.0  6.0  6.0  6.0
1  6.0  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0  6.0


     b    c    d    e    f
0  7.0  7.0  7.0  7.0  7.0
1  7.0  7.0  7.0  7.0  7.0
2  7.0  7.0  7.0  7.0  7.0
3  7.0  7.0  7.0  7.0  7.0
     a    b    c    d    e    f
0  6.0  6.0  6.0  6.0  6.0  NaN
1  6.0  6.0  6.0  6.0  6.0  NaN
2  6.0  6.0  6.0  6.0  6.0  NaN
3  6.0  6.0  6.0  6.0  6.0  NaN
0  NaN  7.0  7.0  7.0  7.0  7.0
1  NaN  7.0  7.0  7.0  7.0  7.0
2  NaN  7.0  7.0  7.0  7.0  7.0
3  NaN  7.0  7.0  7.0  7.0  7.0

join='inner'时,原理同上个例子的说明,但只有相同的column会合并在一起,其他的column会被抛弃。

import numpy as np
import pandas as pd

# df1 has columns a-e and df2 has columns b-f, so they overlap only on b-e.
df1 = pd.DataFrame(np.full((4, 5), 6.0), columns=list('abcde'))
df2 = pd.DataFrame(np.full((4, 5), 7.0), columns=list('bcdef'))

# Inner join keeps only the columns present in both frames (b-e).
res = pd.concat([df1, df2], axis=0, join='inner')

# Show both inputs and the joined result, each followed by a blank separator.
for frame in (df1, df2, res):
    print(frame)
    print('\n')

# Same inner join, but with the row labels reset to 0-7; print the result.
res_sort = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
print(res_sort)

#输出
     a    b    c    d    e
0  6.0  6.0  6.0  6.0  6.0
1  6.0  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0  6.0


     b    c    d    e    f
0  7.0  7.0  7.0  7.0  7.0
1  7.0  7.0  7.0  7.0  7.0
2  7.0  7.0  7.0  7.0  7.0
3  7.0  7.0  7.0  7.0  7.0


     b    c    d    e
0  6.0  6.0  6.0  6.0
1  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0
0  7.0  7.0  7.0  7.0
1  7.0  7.0  7.0  7.0
2  7.0  7.0  7.0  7.0
3  7.0  7.0  7.0  7.0


     b    c    d    e
0  6.0  6.0  6.0  6.0
1  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0
4  7.0  7.0  7.0  7.0
5  7.0  7.0  7.0  7.0
6  7.0  7.0  7.0  7.0
7  7.0  7.0  7.0  7.0

4.join_axes(按指定的index对齐;该参数在pandas 0.25中弃用、1.0中移除,新版本请改用reindex)

import numpy as np
import pandas as pd

# df1 is labelled 1-4 and df2 is labelled 2-5, so only rows 2-4 are shared.
df1 = pd.DataFrame(np.ones((4, 5)) * 6, columns=['a', 'b', 'c', 'd', 'e'], index=[1, 2, 3, 4])
df2 = pd.DataFrame(np.ones((4, 5)) * 7, columns=['b', 'c', 'd', 'e', 'f'], index=[2, 3, 4, 5])

print(df1)
print('\n')
print(df2)
print('\n')
# Plain horizontal concat: the result uses the union of both indexes (1-5),
# and rows that a frame does not have are filled with NaN.
res = pd.concat([df1, df2], axis=1)
print(res)
print('\n')
# Align on df1's index only. The old `join_axes=[df1.index]` argument was
# deprecated in pandas 0.25 and removed in 1.0 (it now raises TypeError), so
# df2 is reindexed to df1's row labels before concatenating instead.
res_sort = pd.concat([df1, df2.reindex(df1.index)], axis=1)
print(res_sort)

#输出
     a    b    c    d    e
1  6.0  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0  6.0
4  6.0  6.0  6.0  6.0  6.0


     b    c    d    e    f
2  7.0  7.0  7.0  7.0  7.0
3  7.0  7.0  7.0  7.0  7.0
4  7.0  7.0  7.0  7.0  7.0
5  7.0  7.0  7.0  7.0  7.0


     a    b    c    d    e    b    c    d    e    f
1  6.0  6.0  6.0  6.0  6.0  NaN  NaN  NaN  NaN  NaN
2  6.0  6.0  6.0  6.0  6.0  7.0  7.0  7.0  7.0  7.0
3  6.0  6.0  6.0  6.0  6.0  7.0  7.0  7.0  7.0  7.0
4  6.0  6.0  6.0  6.0  6.0  7.0  7.0  7.0  7.0  7.0
5  NaN  NaN  NaN  NaN  NaN  7.0  7.0  7.0  7.0  7.0


     a    b    c    d    e    b    c    d    e    f
1  6.0  6.0  6.0  6.0  6.0  NaN  NaN  NaN  NaN  NaN
2  6.0  6.0  6.0  6.0  6.0  7.0  7.0  7.0  7.0  7.0
3  6.0  6.0  6.0  6.0  6.0  7.0  7.0  7.0  7.0  7.0
4  6.0  6.0  6.0  6.0  6.0  7.0  7.0  7.0  7.0  7.0

5.append (添加数据)

append只能纵向合并,不能横向合并。注意:DataFrame.append已在pandas 1.4中弃用、在pandas 2.0中移除,新版本请改用pd.concat。

import numpy as np
import pandas as pd

# Three frames with partially overlapping columns, plus a Series whose labels
# match df1's columns.
df1 = pd.DataFrame(np.ones((4, 5)) * 6, columns=['a', 'b', 'c', 'd', 'e'], index=[1, 2, 3, 4])
df2 = pd.DataFrame(np.ones((4, 5)) * 7, columns=['b', 'c', 'd', 'e', 'f'], index=[2, 3, 4, 5])
df3 = pd.DataFrame(np.ones((4, 5)) * 8, columns=['c', 'd', 'e', 'f', 'g'], index=[3, 4, 5, 6])
s1 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(df1)
print(df2)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement for every call below.

# Put df2 below df1, reset the index, and print the result.
res_df1_df2 = pd.concat([df1, df2], ignore_index=True)
print(res_df1_df2)
# Put df2 and df3 below df1 in one call, reset the index, and print the result.
res_df1_df2_df3 = pd.concat([df1, df2, df3], ignore_index=True)
print(res_df1_df2_df3)
# Add the Series as one extra row: to_frame().T turns it into a single-row
# frame whose columns are the Series labels, so it lines up with df1.
res_series = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res_series)

#输出
     a    b    c    d    e
1  6.0  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0  6.0
4  6.0  6.0  6.0  6.0  6.0
     b    c    d    e    f
2  7.0  7.0  7.0  7.0  7.0
3  7.0  7.0  7.0  7.0  7.0
4  7.0  7.0  7.0  7.0  7.0
5  7.0  7.0  7.0  7.0  7.0
     a    b    c    d    e    f
0  6.0  6.0  6.0  6.0  6.0  NaN
1  6.0  6.0  6.0  6.0  6.0  NaN
2  6.0  6.0  6.0  6.0  6.0  NaN
3  6.0  6.0  6.0  6.0  6.0  NaN
4  NaN  7.0  7.0  7.0  7.0  7.0
5  NaN  7.0  7.0  7.0  7.0  7.0
6  NaN  7.0  7.0  7.0  7.0  7.0
7  NaN  7.0  7.0  7.0  7.0  7.0
      a    b    c    d    e    f    g
0   6.0  6.0  6.0  6.0  6.0  NaN  NaN
1   6.0  6.0  6.0  6.0  6.0  NaN  NaN
2   6.0  6.0  6.0  6.0  6.0  NaN  NaN
3   6.0  6.0  6.0  6.0  6.0  NaN  NaN
4   NaN  7.0  7.0  7.0  7.0  7.0  NaN
5   NaN  7.0  7.0  7.0  7.0  7.0  NaN
6   NaN  7.0  7.0  7.0  7.0  7.0  NaN
7   NaN  7.0  7.0  7.0  7.0  7.0  NaN
8   NaN  NaN  8.0  8.0  8.0  8.0  8.0
9   NaN  NaN  8.0  8.0  8.0  8.0  8.0
10  NaN  NaN  8.0  8.0  8.0  8.0  8.0
11  NaN  NaN  8.0  8.0  8.0  8.0  8.0
     a    b    c    d    e
0  6.0  6.0  6.0  6.0  6.0
1  6.0  6.0  6.0  6.0  6.0
2  6.0  6.0  6.0  6.0  6.0
3  6.0  6.0  6.0  6.0  6.0
4  1.0  2.0  3.0  4.0  5.0
发布了143 篇原创文章 · 获赞 388 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/weixin_37763870/article/details/104911013