#coding=utf-8
import pandas as pd
import numpy as np
import math
# Load the input records; the rest of the script groups them by 'userid'.
data = pd.read_csv('useriddayu1.csv')

# Work on an explicit full copy of the loaded frame.  The original spelled
# this as an empty-tuple ``.loc`` selection, which kept every row: both
# ``data`` and ``df`` were recorded with shape (34250, 8).
df = data.copy()

# One group per distinct userid; iterated later to build one output row each.
grouped = df.groupby(df['userid'])
def getIndexSize(df):
    """Return how many distinct ``userid`` values *df* contains.

    This equals the number of groups produced by grouping *df* on its
    ``userid`` column: missing ids are not counted, and an empty frame
    yields 0.

    Args:
        df: DataFrame that has a ``userid`` column.

    Returns:
        The number of distinct non-null user ids as an ``int``.

    Raises:
        KeyError: if *df* has no ``userid`` column.
    """
    # nunique() skips NaN by default, matching groupby's default handling of
    # missing keys, so no explicit counting loop over the groups is needed.
    return df['userid'].nunique()
def _shannon_entropy(values, total):
    """Return the natural-log Shannon entropy of the values in one column.

    Args:
        values: pandas Series holding one column of a single user's rows.
        total: number of rows in that user's group; used as the denominator
            of every probability.

    Returns:
        ``-sum(p * ln(p))`` over the distinct values, as a float; 0.0 when
        the column has no countable values.
    """
    entropy = 0.0
    # value_counts() leaves out missing values by default, while `total`
    # counts every row, so probabilities are relative to the full group size.
    for count in values.value_counts():
        # Convert before dividing so the ratio is never truncated by
        # integer division.
        p = float(count) / total
        entropy -= p * math.log(p)
    return entropy


# Holds the processed output: one row per userid, same columns as the input.
results = pd.DataFrame(index=range(getIndexSize(df)), columns=df.columns)

# NOTE(review): the original indentation was lost; the "sum: " trace is
# emitted once per column and the final marker once after all groups.
# Confirm against the intended output if the console trace matters.
for row, (user_id, group) in enumerate(grouped):
    group_size = len(group)
    for column in df.columns:
        print("===", column)
        if column == 'userid':
            # Write through a single indexer call: chained indexing such as
            # ``results.iloc[row][column] = ...`` assigns to a temporary
            # object and can leave `results` unchanged.
            results.at[row, 'userid'] = user_id
            continue
        entropy = _shannon_entropy(group[column], group_size)
        print("sum: ", entropy)
        results.at[row, column] = entropy
print("wowowowowowowow")

results.to_csv('10.csv', columns=df.columns, index=False)
# Python code for computing information entropy
# You may also like
# Reposted from blog.csdn.net/youxinyuchu/article/details/83821612
# Today's recommendations
# Weekly ranking