利用python进行数据分析-pandas.concat/subplots/groupby/pivot_table,多文件整合、聚合、分组，子图

import pandas as pd
from matplotlib import pyplot as plt

import numpy as np

# The yearly files read here cover 1880 through 2010 (the range end is exclusive).
years = range(1880, 2011)

# Column names assigned to each file when it is read.
columns = ['name', 'sex', 'births']

# Read one file per year and tag every row with the year it came from.
pieces = []
for year in years:
    # Common path pattern shared by all yearly files.
    path = 'D:\\python program\\names\\babynames\\yob%d.txt' % year
    frame = pd.read_csv(path, names=columns).assign(year=year)
    pieces.append(frame)

# Merge everything into one DataFrame; ignore_index=True drops the per-file
# row numbers so the result gets a fresh index.
names = pd.concat(pieces, ignore_index=True)

# Aggregate total births with year as rows and sex as columns.
# aggfunc is passed as the string 'sum': handing the builtin `sum` to pandas
# as an aggregation callable is deprecated in recent releases.
total_births = pd.pivot_table(names, index='year', columns=['sex'],
                              values=['births'], aggfunc='sum')

# print(total_births.tail())   # uncomment to inspect the last rows

# Plot the totals.
plt.plot(total_births)
plt.title('total births by sex and year')
plt.show()

此时生成的names数据为：

---------------------------------------------------------------------

# Group by year and sex, then total the births inside each group.
names.groupby(['year', 'sex'])['births'].sum()

----------------------------------------------------------------------------------------------

# Add a 'prop' column: each row's share of its group's total births.
def add_prop(group):
    """Return a copy of *group* with a 'prop' column appended.

    'prop' is each row's ``births`` divided by the sum of ``births`` over the
    whole group.  The frame passed in is not modified: writing a column into
    the object handed over by the caller mutates data the caller owns and can
    trigger pandas' SettingWithCopyWarning.
    """
    # Cast to float so the division below is never an integer division.
    births = group.births.astype(float)
    return group.assign(prop=births / births.sum())
# Attach each row's share of its (year, sex) group's births as 'prop'.
# transform('sum') broadcasts every group's total back onto the original
# index, so `names` keeps its flat index and its 'year'/'sex' columns.
# groupby(...).apply(add_prop) on pandas >= 2.0 instead prepends the group
# keys to the index, which makes later groupby(['year', 'sex']) calls on
# `names` ambiguous.
names['prop'] = names['births'] / names.groupby(['year', 'sex'])['births'].transform('sum')

图为：

------------------------------------------------------------------------------

进行验证：通过np.allclose检查各个分组的prop总计值是否都接近1，是则输出True

# Sanity check: the 'prop' values of every (year, sex) group should sum to 1.
group_totals = names.groupby(['year', 'sex'])['prop'].sum()
print(np.allclose(group_totals, 1))

------------------------------------------------------------------------------------

# Collect a subset per group: the top 1000 names of each sex/year pair.
def get_top1000(group):
    """Return the first 1000 rows of *group* after sorting by 'births', descending.

    The sort uses the single 'births' column; groups with fewer than 1000
    rows are returned whole (sorted).
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:1000]
grouped = names.groupby(['year', 'sex'])
# Build top1000 by concatenating the per-group results rather than calling
# grouped.apply(get_top1000): apply() pushes 'year' and 'sex' into the index
# while they also remain columns, which makes the later
# groupby(['year', 'sex']) and pivot_table(index='year') calls on top1000
# ambiguous.  Iterating the groups visits them in the same sorted key order,
# and ignore_index=True gives the result a plain positional index.
top1000 = pd.concat([get_top1000(group) for _, group in grouped],
                    ignore_index=True)

图：

-----------------------------------------------------------------------------------------------

分别索引男、女

# Split the top-1000 table into one frame per sex.
boys = top1000[top1000['sex'] == 'M']
girls = top1000[top1000['sex'] == 'F']

---------------------------------------------------------------------------------------

在这前1000个名字的数据上，生成按year和name统计的总出生数透视表

# Pivot the top-1000 data into total births with year as rows and name as
# columns.  aggfunc is the string 'sum' because passing the builtin `sum` as
# an aggregation callable is deprecated in recent pandas releases.
total_births = pd.pivot_table(top1000, values=['births'], index=['year'],
                              columns='name', aggfunc='sum')

--------------------------------------------------------------------------------------

在一张图布上根据4个名字查看随年份命名的变化，建立4个折线图

# Select four names and draw one line chart per name (subplots=True).
selected_names = ['John', 'Harry', 'Mary', 'Marilyn']
subset = total_births['births'][selected_names]
subset.plot(subplots=True, grid=False, figsize=(12, 10),
            title="numbers of births per year")

plt.show()

图：

--------------------------------------------------------------------------------------------

计算最流行的1000个名字所占的比例，按照year和sex聚合并绘图

# Sum 'prop' over the top-1000 rows with year as rows and sex as columns.
# aggfunc is the string 'sum' (the builtin `sum` is deprecated as an
# aggregation callable in recent pandas).
table = pd.pivot_table(top1000, values='prop', index=['year'],
                       columns='sex', aggfunc='sum')
# The title names the frame and column actually plotted (top1000 / prop).
table.plot(title="sum of top1000.prop by year and sex",
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
plt.show()

图：分性别统计的前1000个名字在总出生人数中的比例

-----------------------------------------------------------------------------------------------

对2010年男孩的名字按prop降序排列后，计算前多少个名字的人数加起来才够50%

# Boys' names for the year 2010.
df = boys[boys['year'] == 2010]
# Running total of 'prop', largest shares first.
by_share = df.sort_values(by='prop', ascending=False)
prop_cumsum = by_share['prop'].cumsum()
# searchsorted reports where 0.5 would be inserted; positions are 0-based,
# so add 1 to turn the position into a count of names.
print(prop_cumsum.searchsorted(0.5) + 1)

---------------------------------------------------------------------------------------------------

按照上例对所有的year/sex组合执行上述计算：按这两个字段进行groupby处理，然后用一个函数计算各分组的这个值

def get_quantile_count(group, q=0.5):
    """Return how many of the largest 'prop' values are needed to reach *q*.

    The 'prop' column of *group* is sorted in descending order and
    accumulated; the result is the 0-based insertion position of *q* in that
    running total, plus one.
    """
    shares = group['prop'].sort_values(ascending=False)
    running_total = shares.cumsum()
    position = running_total.searchsorted(q)
    return position + 1
# Evaluate the count for every (year, sex) group, then spread sex into columns.
per_group = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = per_group.unstack('sex')
plt.plot(diversity)
plt.title("number of popular names in top 50%")
plt.show()

图：每年的prop在前1000个名字中，累计达到50%的位置按照sex分类的趋势图

-------------------------------------------------------------------------------------

分析名字中最后一个字母上的分布变化

# Extract the last letter of every name.
def get_last_letter(x):
    """Return the final character of the string *x*."""
    return x[-1]


last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'
# Aggregate all births by last letter (rows) and by sex and year (columns).
# aggfunc is the string 'sum' because passing the builtin `sum` as an
# aggregation callable is deprecated in recent pandas releases.
table = pd.pivot_table(names, values='births', index=last_letters,
                       columns=['sex', 'year'], aggfunc='sum')
# Keep three representative years.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')

图：

-----------------------------------------------------------------------------------------

各年度各性别的名字末字母所占总人数的条形图

# Share of each last letter within every (sex, year) column.
column_totals = subtable.sum().astype(float)
letter_prop = subtable / column_totals
# Two stacked bar charts: male on top, female below.
fig, axes = plt.subplots(2, 1, figsize=(10, 10))
male_ax, female_ax = axes
letter_prop['M'].plot(kind='bar', rot=0, ax=male_ax, title="Male")
letter_prop['F'].plot(kind='bar', rot=0, ax=female_ax, title="Female",
                      legend=False)
plt.show()

图：

------------------------------------------------------------------------------------------------------

上例中男孩名字末字母所占比例的变化，本例选取几个特定字母进行分析

# Normalize the full table per (sex, year) column, then look at a few
# last letters among the boys' names.
letter_prop = table / table.sum().astype(float)
# .loc replaces the .ix indexer, which was removed in pandas 1.0; the row
# labels are letters and 'M' selects the male columns, so the selection is
# purely label-based.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.plot()
plt.show()

图：各年出生的男孩中名字以d/n/y结尾的人数比例

-------------------------------------------------------------------------------

早年流行于男孩的名字近年来流行于女孩。回到top1000数据集，找出以‘lesl’开头的一组名字

# Boy names that became girl names.
all_names = top1000['name'].unique()
# Flag every name that contains 'lesl', case-insensitively.
# NOTE(review): this is a substring test, not a prefix test, although the
# surrounding text talks about names *starting* with 'lesl' - confirm intent.
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesl_like = all_names[mask]
# Total births for each of the matching names.
filtered = top1000[top1000['name'].isin(lesl_like)]
filtered.groupby('name')['births'].sum()

图：

---------------------------------------------------------------------------------------

分析各年以‘lesl’开头的名字男女比例

# Aggregate the filtered births with year as rows and sex as columns.
# aggfunc is the string 'sum' because passing the builtin `sum` as an
# aggregation callable is deprecated in recent pandas releases.
table = pd.pivot_table(filtered, values='births', index='year',
                       columns='sex', aggfunc='sum')
# Normalize each row by its own total so the values become per-year shares.
table = table.div(table.sum(axis=1), axis=0)
# Plot: solid line for 'M', dashed line for 'F'.
table.plot(style={'M': 'k-', 'F': 'k--'})
plt.show()

图：各年度使用‘lesl’型名字的男女比例

利用python进行数据分析-pandas.concat/subplots/groupby/pivot_table,多文件整合、聚合、分组，子图

猜你喜欢