#coding=gbk
#数据处理
import numpy as np
import pandas as pd
# ---------------------------------------------------------------------------
# Merging DataFrames with pd.merge()
#
# NOTE: the tables in the comments below are sample output only. Column order
# (and the row order of the outer join) may differ depending on the pandas
# version in use.
# ---------------------------------------------------------------------------
frame1 = pd.DataFrame({
    'id': ['pencil', 'pen', 'book', 'smug'],
    'price': [12, 14, 23.5, 66],
})
print(frame1)
#        id  price
# 0  pencil   12.0
# 1     pen   14.0
# 2    book   23.5
# 3    smug   66.0

frame2 = pd.DataFrame({
    'id': ['pencil', 'pencil', 'book', 'ball'],
    'color': ['red', 'blue', 'white', 'yellow'],
})
print(frame2)
#     color      id
# 0     red  pencil
# 1    blue  pencil
# 2   white    book
# 3  yellow    ball

# merge() keeps the rows whose key appears in both frames; `on` names the
# column used as the key.
frame3 = pd.merge(left=frame1, right=frame2, on='id')
print(frame3)
#        id  price  color
# 0  pencil   12.0    red
# 1  pencil   12.0   blue
# 2    book   23.5  white

# When the key column is named differently in the two frames, name each side
# explicitly with left_on / right_on.
f1 = pd.DataFrame({
    'sid': ['pencil', 'pen', 'book', 'smug'],
    'number': [12, 14, 23.5, 66],
})
print(f1)

# Key columns shared by every f1 <-> frame2 merge below.
_key_columns = {'left_on': 'sid', 'right_on': 'id'}

# Rows are combined where `sid` and `id` hold the same value.
print(pd.merge(left=f1, right=frame2, **_key_columns))
#    number     sid  color      id
# 0    12.0  pencil    red  pencil
# 1    12.0  pencil   blue  pencil
# 2    23.5    book  white    book

# Everything above is an inner join. The `how` option selects a left, right
# or outer join instead; they work like the corresponding database joins.

# Left join: every row of f1 is kept; where frame2 has no match the
# frame2 columns are filled with NaN.
f2 = pd.merge(left=f1, right=frame2, how='left', **_key_columns)
print(f2)
#    number     sid  color      id
# 0    12.0  pencil    red  pencil
# 1    12.0  pencil   blue  pencil
# 2    14.0     pen    NaN     NaN
# 3    23.5    book  white    book
# 4    66.0    smug    NaN     NaN

# Outer join: rows from both sides are kept.
print(pd.merge(left=f1, right=frame2, how='outer', **_key_columns))
#    number     sid   color      id
# 0    12.0  pencil     red  pencil
# 1    12.0  pencil    blue  pencil
# 2    14.0     pen     NaN     NaN
# 3    23.5    book   white    book
# 4    66.0    smug     NaN     NaN
# 5     NaN     NaN  yellow    ball

# Merging on the index instead of a column. Both frames have an `id` column,
# so the result carries them as id_x and id_y.
print(pd.merge(left=frame1, right=frame2, left_index=True, right_index=True))
#      id_x  price   color    id_y
# 0  pencil   12.0     red  pencil
# 1     pen   14.0    blue  pencil
# 2    book   23.5   white    book
# 3    smug   66.0  yellow    ball

# NOTE: frame1.join(frame2) cannot be used here because the two frames share
# the column name `id`.
# ---------------------------------------------------------------------------
# Concatenation, part 1: joining arrays with numpy's concatenate()
# ---------------------------------------------------------------------------
array1 = np.arange(9).reshape((3, 3))
array2 = array1 + 12  # same 3x3 layout, every element shifted by 12

# axis=1: the second array is appended as extra columns.
print(np.concatenate((array1, array2), axis=1))
# [[ 0  1  2 12 13 14]
#  [ 3  4  5 15 16 17]
#  [ 6  7  8 18 19 20]]

# axis=0: the second array is appended as extra rows.
print(np.concatenate((array1, array2), axis=0))
# [[ 0  1  2]
#  [ 3  4  5]
#  [ 6  7  8]
#  [12 13 14]
#  [15 16 17]
#  [18 19 20]]
# ---------------------------------------------------------------------------
# Concatenation, part 2: the pandas concat() function
#
# NOTE: the Series hold random numbers, so the values in the sample output
# below are illustrative only and change on every run.
# ---------------------------------------------------------------------------
s1 = pd.Series(data=np.random.rand(4), index=[1, 2, 3, 4])
print(s1)
# 1    0.991280
# 2    0.637284
# 3    0.563105
# 4    0.840878
# dtype: float64

s2 = pd.Series(data=np.random.rand(4), index=[5, 6, 7, 8])
print(s2)
# 5    0.955175
# 6    0.236703
# 7    0.867245
# 8    0.779975
# dtype: float64

_both_series = [s1, s2]

# Default axis (0): the second Series is appended below the first.
print(pd.concat(_both_series, axis=0))
# 1    0.598666
# 2    0.630031
# 3    0.688623
# 4    0.020529
# 5    0.956814
# 6    0.654709
# 7    0.390657
# 8    0.531092
# dtype: float64

# axis=1: each Series becomes its own column; index labels present in only
# one of them are filled with NaN in the other column.
print(pd.concat(_both_series, axis=1))
#           0         1
# 1  0.101299       NaN
# 2  0.403274       NaN
# 3  0.893143       NaN
# 4  0.412365       NaN
# 5       NaN  0.909780
# 6       NaN  0.601977
# 7       NaN  0.227269
# 8       NaN  0.755501

# `keys` labels the concatenated parts (here: the column names).
print(pd.concat(_both_series, axis=1, keys=['part1', 'part2']))
#       part1     part2
# 1  0.961096       NaN
# 2  0.513365       NaN
# 3  0.781232       NaN
# 4  0.289688       NaN
# 5       NaN  0.807915
# 6       NaN  0.981938
# 7       NaN  0.366429
# 8       NaN  0.678213
# ---------------------------------------------------------------------------
# Combining: combine_first() for two data sets whose indexes overlap partly
# or completely
#
# NOTE: the Series hold random numbers, so the values in the sample output
# below are illustrative only and change on every run.
# ---------------------------------------------------------------------------
s3 = pd.Series(data=np.random.rand(5), index=[1, 2, 3, 4, 5])
print(s3)
# 1    0.165086
# 2    0.344045
# 3    0.354699
# 4    0.283886
# 5    0.084317
# dtype: float64

s4 = pd.Series(data=np.random.rand(4), index=[1, 3, 4, 6])
print(s4)
# 1    0.122579
# 3    0.401560
# 4    0.948493
# 6    0.976377
# dtype: float64

# Where both Series have a label, the value from s3 is used; labels found
# only in s4 are added from s4.
print(s3.combine_first(other=s4))
# 1    0.087640
# 2    0.386778
# 3    0.817685
# 4    0.217306
# 5    0.067610
# 6    0.276078
# dtype: float64

# Roles swapped: s4's values are used for shared labels, s3 supplies the rest.
print(s4.combine_first(other=s3))
# 1    0.326173
# 2    0.386778
# 3    0.615864
# 4    0.805372
# 5    0.067610
# 6    0.276078
# dtype: float64