Data analysis: handling missing values

# Missing values
# Boolean mask with the same shape as `data`: True where a cell is missing,
# False otherwise.
data.isnull()
# Number of missing values in each column (True counts as 1 when summed).
data.isnull().sum()
# Drop, directly in the source DataFrame, every row whose 'yuanjia' value
# is missing.
data.dropna(subset=['yuanjia'], inplace=True)


# Duplicate rows
# Count the rows that repeat an earlier row (the first occurrence of each
# row is not flagged).
data.duplicated(keep='first').sum()
# Remove those repeats from `data` itself, keeping the first occurrence.
data.drop_duplicates(keep='first', inplace=True)

# Row count remaining after de-duplication.
data.shape[0]

# Inconsistent value types
# How many 'shoujia' entries are strings carrying the '万' unit suffix.
(data['shoujia'].str.contains('万')).sum()
# Strip the '万' suffix and convert the column to float. The result is
# assigned back so later numeric operations on 'shoujia' (e.g. the grouped
# mean) see numbers rather than strings. Non-string cells (NaN or values
# that are already numeric) are passed through unchanged instead of
# raising AttributeError on `.replace`.
data['shoujia'] = data['shoujia'].map(
    lambda x: float(x.replace('万', '')) if isinstance(x, str) else x
).astype(float)

# Sorting
# Rows ordered by 'licheng', smallest first; returns a new DataFrame and
# leaves `data` untouched.
data.sort_values(by='licheng', ascending=True)
# Same ordering key, largest first.
data.sort_values(by='licheng', ascending=False)

# Count how many 'Value' entries fall into each interval delimited by
# `bins`; for example, the 0-15 interval held 30412 rows in the author's data.
bins = [
    0, 15, 20, 25, 30, 35, 40, 45,
    60, 75, 90, 100, 1000, 2000, 6000,
]
pd.cut(data['Value'], bins=bins).value_counts()

# Grouping
# Group by 'didian' and return the mean 'shoujia' of each group.
data.groupby(['didian'])['shoujia'].mean()
# Group by 'didian' and 'pinpai' together and return, for each pair, the
# number of non-missing 'pinpai' entries in that group.
data.groupby(['didian', 'pinpai'])['pinpai'].count()

Guess you like

Origin blog.csdn.net/disasters/article/details/91954142