# Python数据分析（5）数据处理：数据合并、拼接和组合

#coding=gbk
#数据处理
import numpy as np
import pandas as pd
# --- Merging DataFrames (database-style joins) ------------------------------

# Left table: one row per product id with its price. Mixing ints with 23.5
# makes pandas store the whole 'price' column as floats.
frame1 = pd.DataFrame({
    'id': ['pencil', 'pen', 'book', 'smug'],
    'price': [12, 14, 23.5, 66],
})
print(frame1)

# Right table: a colour per id. 'pencil' occurs twice, and 'ball' has no
# counterpart in frame1.
frame2 = pd.DataFrame({
    'id': ['pencil', 'pencil', 'book', 'ball'],
    'color': ['red', 'blue', 'white', 'yellow'],
})
print(frame2)

# Merge on the shared 'id' column (`on` names the key column). Only ids found
# in both frames are kept: 'pencil' twice (red, blue) and 'book' (white).
frame3 = frame1.merge(frame2, on='id')
print(frame3)

# When the key columns are named differently in the two frames, name each
# side explicitly with left_on / right_on.
f1 = pd.DataFrame({
    'sid': ['pencil', 'pen', 'book', 'smug'],
    'number': [12, 14, 23.5, 66],
})
print(f1)
# Rows are paired wherever f1['sid'] equals frame2['id']; both key columns
# appear in the result.
print(f1.merge(frame2, left_on='sid', right_on='id'))

# Everything above was an inner join. `how` selects left, right or outer
# joins, which work like their SQL counterparts.
# Left join: every f1 row is kept; 'pen' and 'smug' have no match in frame2,
# so their 'color' and 'id' cells are NaN.
f2 = f1.merge(frame2, how='left', left_on='sid', right_on='id')
print(f2)

# Outer join: union of both sides. Compared with the left join there is one
# extra row for 'ball', with NaN in the columns that come from f1.
print(f1.merge(frame2, how='outer', left_on='sid', right_on='id'))

# Merge on the row index instead of a column: row i of frame1 is paired with
# row i of frame2. Both frames have an 'id' column, so the result carries them
# as 'id_x' and 'id_y'.
print(frame1.merge(frame2, left_index=True, right_index=True))
# Note: print(frame1.join(frame2)) cannot be used here, because the two frames
# share the column name 'id'.


# --- 2. Concatenation -------------------------------------------------------

# NumPy: np.concatenate() glues arrays together along an existing axis.
array1 = np.arange(9).reshape(3, 3)
array2 = array1 + 12  # same shape, values 12..20

# axis=1 appends columns (arrays placed side by side):
# [[ 0  1  2 12 13 14]
#  [ 3  4  5 15 16 17]
#  [ 6  7  8 18 19 20]]
print(np.concatenate((array1, array2), axis=1))

# axis=0 appends rows (arrays stacked vertically):
# [[ 0  1  2]
#  [ 3  4  5]
#  [ 6  7  8]
#  [12 13 14]
#  [15 16 17]
#  [18 19 20]]
print(np.concatenate((array1, array2), axis=0))

# pandas: pd.concat() does the same for Series / DataFrame objects.
# Two Series of four random floats with disjoint index labels (1-4 and 5-8).
# The values come from np.random.rand, so they differ on every run.
s1 = pd.Series(np.random.rand(4), index=list(range(1, 5)))
print(s1)
s2 = pd.Series(np.random.rand(4), index=list(range(5, 9)))
print(s2)

# Default axis: a single float64 Series of eight values labelled 1..8,
# s1's entries followed by s2's.
print(pd.concat([s1, s2]))

# Along the columns: a DataFrame with one column per input Series (named 0
# and 1). The indexes do not overlap, so every row has a value in one column
# and NaN in the other.
print(pd.concat([s1, s2], axis='columns'))

# `keys` labels each concatenated piece; here the labels become the column
# names 'part1' and 'part2'.
print(pd.concat([s1, s2], axis='columns', keys=['part1', 'part2']))


# --- Combining: combine_first() ---------------------------------------------
# For two data sets whose indexes overlap partly or completely.
# Both Series hold random floats, so the printed values change on every run.

# s3 covers labels 1-5.
s3 = pd.Series(np.random.rand(5), index=list(range(1, 6)))
print(s3)

# s4 covers labels 1, 3, 4 and 6: it shares 1, 3, 4 with s3 and adds 6.
s4 = pd.Series(np.random.rand(4), index=[1, 3, 4, 6])
print(s4)

# The caller takes priority: labels present in both keep s3's value, and
# label 6, which s3 lacks, is taken from s4. The result is indexed 1..6.
print(s3.combine_first(s4))

# Roles reversed: s4's values win for labels 1, 3, 4 and 6, while labels 2
# and 5 are taken from s3.
print(s4.combine_first(s3))

# Python数据分析（5）数据处理：数据合并、拼接和组合
#
# 猜你喜欢