#coding=gbk
#数据处理
import numpy as np
import pandas as pd
# Pivoting: stack() folds the column labels into an inner index level
# (columns -> rows); unstack() performs the opposite move.
frame = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=["white", "blue", "black"],
    columns=["ball", "book", "pen"],
)
print(frame)
# Expected output:
#        ball  book  pen
# white     0     1    2
# blue      3     4    5
# black     6     7    8

# stack() converts the DataFrame into a Series keyed by (row, column).
s = frame.stack()
print(s)
# Expected output:
# white  ball    0
#        book    1
#        pen     2
# blue   ball    3
#        book    4
#        pen     5
# black  ball    6
#        book    7
#        pen     8
# dtype: int64   (the original run reported int32; this follows NumPy's
#                 default integer type on the platform)

# unstack() moves the inner index level back out to the columns.
f = s.unstack()
print(f)
# Expected output:
#        ball  book  pen
# white     0     1    2
# blue      3     4    5
# black     6     7    8
# Data transformation
# 1. Removing duplicate rows
frame1 = pd.DataFrame(
    {
        "color": ["red", "red", "white", "red", "white"],
        "value": [2, 3, 1, 3, 1],
    }
)
print(frame1)
# Expected output:
#    color  value
# 0    red      2
# 1    red      3
# 2  white      1
# 3    red      3
# 4  white      1

# duplicated() flags a row with True when an identical row appeared earlier.
print(frame1.duplicated())
# Expected output:
# 0    False
# 1    False
# 2    False
# 3     True
# 4     True
# dtype: bool

# Use the boolean mask to list only the rows that are repeats.
print(frame1.loc[frame1.duplicated()])
# Expected output:
#    color  value
# 3    red      3
# 4  white      1

# drop_duplicates() returns a copy that keeps the first occurrence of each row.
frame2 = frame1.drop_duplicates()
print(frame2)
# Expected output:
#    color  value
# 0    red      2
# 1    red      3
# 2  white      1
# 2. Discretization and binning
results = [2, 45, 12, 6, 78, 90, 28, 36, 56, 89]  # values to sort into bins
# Bin edges. Named `bins` (not `bin`) so the built-in bin() is not shadowed.
bins = [0, 25, 50, 75, 100]
c = pd.cut(results, bins)
print(c)
# Each value is labelled with the interval that contains it; intervals are
# open on the left and closed on the right:
# [(0, 25], (25, 50], (0, 25], (0, 25], (75, 100], (75, 100], (25, 50], (25, 50], (50, 75], (75, 100]]
# Categories (4): (0, 25] < (25, 50] < (50, 75] < (75, 100]

# Count how many values fall into each bin. The top-level pd.value_counts()
# helper is deprecated, so wrap the Categorical in a Series and use the method.
print(pd.Series(c).value_counts())
# Expected counts (the order among equal counts may vary):
# (0, 25]      3
# (25, 50]     3
# (75, 100]    3
# (50, 75]     1
# Reordering the rows of a DataFrame
f = pd.DataFrame(np.arange(16).reshape((4, 4)))
print(f)
# Expected output:
#     0   1   2   3
# 0   0   1   2   3
# 1   4   5   6   7
# 2   8   9  10  11
# 3  12  13  14  15

# Row positions listed in the order they should appear in the result.
new_order = [3, 2, 0, 1]
print(f.take(indices=new_order))
# Expected output:
#     0   1   2   3
# 3  12  13  14  15
# 2   8   9  10  11
# 0   0   1   2   3
# 1   4   5   6   7

# Random sampling: draw two row positions in [0, number of rows).
# The draws are independent, so the same row can be picked twice.
sample = np.random.randint(0, f.shape[0], size=2)
print(sample)
# Example output: [0 1] -- effectively a randomly generated new_order
print(f.take(indices=sample))
# Example output for sample == [0 1]:
#    0  1  2  3
# 0  0  1  2  3
# 1  4  5  6  7
# The groupby() function
# (This heading was bare text in the original, which made the file fail to
# parse with a SyntaxError; it is now a comment.)
# Data aggregation
frame2 = pd.DataFrame({'id': ['pencil', 'pencil', 'book', 'ball'],
                       'color': ['red', 'red', 'white', 'white'],
                       'price1': [45, 67, 90, 12],
                       'price2': [89, 36, 12, 45]})
print(frame2)
# Expected output (current pandas keeps the dict's insertion order for the
# columns; older versions sorted them alphabetically):
#        id  color  price1  price2
# 0  pencil    red      45      89
# 1  pencil    red      67      36
# 2    book  white      90      12
# 3    ball  white      12      45

# Grouping with groupby(): split the price2 column by the value of color.
group = frame2['price2'].groupby(frame2['color'])
print(group.groups)
# Maps each color to the row labels in that group:
# red -> [0, 1], white -> [2, 3]
print(group.mean())
# Mean price2 for each color:
# color
# red      62.5
# white    28.5
# Name: price2, dtype: float64

# Hierarchical (two-level) grouping: pass a list of key Series, here
# [frame2['color'], frame2['id']], so both color and id define the groups.
print(frame2['price2'].groupby([frame2['color'], frame2['id']]).sum())
# Expected output:
# color  id
# red    pencil    125
# white  ball       45
#        book       12
# Name: price2, dtype: int64

# Aggregating several columns at once: note the double brackets in
# frame2[['price1', 'price2']], which select a DataFrame, not a Series.
print(frame2[['price1', 'price2']].groupby(frame2['id']).sum())
# Expected output:
#         price1  price2
# id
# ball        12      45
# book        90      12
# pencil     112     125