python数据分析pandas基础用法

import pandas as pd

# Load the data set and preview its first rows
df = pd.read_csv('cancer_data.csv')
df.head()

# Return a tuple with the DataFrame's dimensions: (rows, columns)
df.shape

# Return the data type of each column
df.dtypes

# dtypes reports the diagnosis column as object; checking the type of a
# single value shows what is actually stored (a string, per the original note)
type(df['diagnosis'][0])

# Return descriptive statistics for the columns
df.describe()

# head() also accepts the number of rows you want returned
df.head(20)

# `.tail()` returns the last few rows; the row count can be specified too
df.tail(2)

# Show the positional index and the label of every column
for i, v in enumerate(df.columns):
    print(i, v)

# Select every column from 'id' through 'fractal_dimension_mean', the last
# mean column (label slicing with .loc includes both end points)
df_means = df.loc[:,'id':'fractal_dimension_mean']
df_means.head()

# Repeat the selection by position: the first 12 columns
# (presumably the same range as above -- confirm against the column listing)
df_means = df.iloc[:,:12]
df_means.head()

保存均值 dataframe ，以便稍后使用。

# index=False keeps the row index from being written as an extra column
df_means.to_csv('cancer_data_edited.csv', index=False)

# Fill missing values with the column mean
# NOTE(review): `cancer` is not defined in this section; presumably it is the
# DataFrame read from cancer_data.csv (named `df` above) -- confirm.
mean = cancer['smoothness_mean'].mean()

# Two ways to apply the fill.
# 1) Assign the filled column back:
cancer['smoothness_mean'] = cancer['smoothness_mean'].fillna(mean)

# 2) Fill in place through the DataFrame itself. Calling
#    fillna(..., inplace=True) on cancer['smoothness_mean'] operates on an
#    intermediate object, which pandas warns about and which leaves `cancer`
#    unchanged when Copy-on-Write is enabled.
cancer.fillna({'smoothness_mean': mean}, inplace=True)

# Drop duplicate rows
cancer.drop_duplicates(inplace=True)

# Check for duplicates again to confirm the change
cancer.duplicated()

# Count the rows still flagged as duplicates
cancer.duplicated().sum()

重命名列

由于之前修改了数据集，使其仅包括肿瘤特征的均值，因此每个特征末尾好像不需要 "_mean" 。而且，稍后输入分析还要多耗费时间。我们现在想一些要分配给列的新标签。

# Build the new labels: strip a trailing '_mean' (5 characters) from column
# names that end with it and keep every other name unchanged. endswith() is
# used so that a name merely containing '_mean' is not truncated.
new_labels = [
    col[:-5] if col.endswith('_mean') else col
    for col in df.columns
]

# The new column labels
new_labels

# Assign the new labels to the DataFrame's columns
df.columns = new_labels

# Show the first rows of the DataFrame to confirm the change
df.head()

# Save it for later use
df.to_csv('cancer_data_edited.csv', index=False)

类型转换

# Convert the 'timestamp' column to datetime values
df['timestamp']=pd.to_datetime(df['timestamp'])

但是保存为 CSV 后数据类型不会保留，重新读取后需要再次转换（例如再次调用 pd.to_datetime，或在 pd.read_csv 中使用 parse_dates 参数）。

使用pandas绘图

import pandas as pd
% matplotlib inline
df=pd.read_csv('powerplant_data_edited.csv')
df.info()
df.hist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
AT    9568 non-null float64
V     9568 non-null float64
AP    9568 non-null float64
RH    9568 non-null float64
PE    9568 non-null float64
dtypes: float64(5)
memory usage: 373.8 KB

# figsize enlarges the figure; the trailing semicolon suppresses the
# unneeded text output the notebook would otherwise display
df.hist(figsize=(8,8));

# Histogram of the AP column only
df['AP'].hist(figsize=(8,8));

单独显示

# Histogram of AP drawn through the generic plot() interface
df['AP'].plot(kind='hist');

# Number of occurrences of each distinct AP value
df['AP'].value_counts()

绘制饼状图

# Pie chart with one slice per distinct AP value, sized by its count
df['AP'].value_counts().plot(kind='pie',figsize=(8,8));

# Grid of pairwise scatter plots between the columns of df
pd.plotting.scatter_matrix(df,figsize=(15,15))

散点图

指定 x、y 列的散点图

# Scatter plot of one column against another. x and y must name columns that
# exist in df; 'A' and 'B' are not among this data set's columns
# (AT, V, AP, RH, PE), so real column names are used here.
df.plot(x='AT',y='PE',kind='scatter');

# Box plot of the AP column
df['AP'].plot(kind='box');

# Bar chart of how often each distinct AT value occurs
df['AT'].value_counts().plot(kind='bar');

import pandas as pd
%matplotlib inline
# Load the weekly store sales data and preview the first rows
df=pd.read_csv('store_data.csv')

df.head()

	week	storeA	storeB	storeC	storeD	storeE
0	2014-05-04	2643	8257	3893	6231	1294
1	2014-05-11	6444	5736	5634	7092	2907
2	2014-05-18	9646	2552	4253	5447	4736
3	2014-05-25	5960	10740	8264	6063	949
4	2014-06-01	7412	7374	3208	3985	3023

# Boolean mask: True for weeks on or before 2016-04-03.
# NOTE(review): 'week' is read without parse_dates, so this is presumably a
# string comparison that relies on the yyyy-mm-dd format -- confirm.
mask = df['week'] <='2016-04-03'
print(mask)

28      True
29      True
       ...  
170    False
171    False
172    False
173    False

或者

# Filter the rows directly with the boolean condition
df_week=df[ df['week'] <='2016-04-03' ]

或者 df_week=df[mask]

# Display the filtered rows
df_week

# First rows of the filtered data
df_week.head()

# Summary statistics of the filtered data
df_week.describe()

	storeA	storeB	storeC	storeD	storeE
count	1.0	1.0	1.0	1.0	1.0
mean	2054.0	1390.0	5112.0	5513.0	2536.0
std	NaN	NaN	NaN	NaN	NaN
min	2054.0	1390.0	5112.0	5513.0	2536.0
25%	2054.0	1390.0	5112.0	5513.0	2536.0
50%	2054.0	1390.0	5112.0	5513.0	2536.0
75%	2054.0	1390.0	5112.0	5513.0	2536.0
max	2054.0	1390.0	5112.0	5513.0	2536.0

# Show the last 20 rows of the sales data
df.tail(20)

	week	storeA	storeB	storeC	storeD	storeE
180	2017-10-15	8556	11984	4792	5995	2508
181	2017-10-22	3751	697	3990	4236	360
182	2017-10-29	4997	9759	4290	4568	2393
183	2017-11-05	12785	1800	6163	5157	578
184	2017-11-12	137	12261	5455	7695	2599
185	2017-11-19	9960	8529	4501	7631	505
186	2017-11-26	6866	5011	5401	4736	3232
187	2017-12-03	5179	3850	6121	6778	113
188	2017-12-10	9348	5624	5446	5448	227
189	2017-12-17	5310	8647	5680	7049	3578
190	2017-12-24	8976	9503	6240	3882	2890
191	2017-12-31	11875	1527	6711	5265	1701
192	2018-01-07	8978	11312	4158	5019	3842
193	2018-01-14	6963	4014	4215	7153	3097
194	2018-01-21	5553	3971	3761	6255	3071
195	2018-01-28	282	6351	7759	5558	1028
196	2018-02-04	4853	6503	4187	5956	1458
197	2018-02-11	9202	3677	4540	6186	243
198	2018-02-18	3512	7511	4151	5596	3501
199	2018-02-25	7560	6904	3569	5045	2585

# Total sales for the last month: rows from position 196 onward,
# skipping the first column (week)
df.iloc[196:, 1:].sum()

storeA    25127
storeB    24595
storeC    16447
storeD    22783
storeE     7787
dtype: int64

# Average sales per store. numeric_only=True skips the non-numeric 'week'
# column; since pandas 2.0 the default (numeric_only=False) raises a
# TypeError when a string column is present.
df.mean(numeric_only=True)

storeA    5865.480
storeB    6756.710
storeC    4942.105
storeD    5431.405
storeE    2580.025
dtype: float64

# Sales for 2016-03-13: the rows whose 'week' equals that date string
df[df['week'] == '2016-03-13']

	week	storeA	storeB	storeC	storeD	storeE
97	2016-03-13	2054	1390	5112	5513	2536

# The week in which store C had its lowest sales
df[df['storeC'] == df['storeC'].min()]

	week	storeA	storeB	storeC	storeD	storeE
9	2014-07-06	8567	3228	927	3277	168

# Total sales for the most recent three months (weeks from 2017-12-01 on),
# skipping the first column (week)
last_three_months = df[df['week'] >= '2017-12-01']
last_three_months.iloc[:, 1:].sum()

storeA    87591
storeB    79394
storeC    66538
storeD    75190
storeE    27334
dtype: int64

# Last month's sales per store (rows from position 196 on), as a bar chart
df.iloc[196:, 1:].sum().plot(kind='bar');

# Average sales per store, as a pie chart. numeric_only=True skips the
# non-numeric 'week' column; since pandas 2.0 the default raises a TypeError
# when a string column is present.
df.mean(numeric_only=True).plot(kind='pie');

# Sales for the week of 2016-03-13: take the first matching row and drop the
# week column before plotting
sales = df[df['week'] == '2016-03-13']
sales.iloc[0, 1:].plot(kind='bar');

# Sales over the past three months (weeks from 2017-12-01 on), as a pie chart
last_three_months = df[df['week'] >= '2017-12-01']
last_three_months.iloc[:, 1:].sum().plot(kind='pie');

让标签顺序一致、不按计数值排序：先取出 value_counts() 结果的 index，再用该 index 对 value_counts() 的结果进行索引

# Rows whose income is '<=50K'
df_a=df[ df['income'] == '<=50K' ]

# Capture the label order of the education counts once
ind=df_a['education'].value_counts().index

# Index the counts with that saved order so the bars follow it; the same
# `ind` must be used here (an undefined name would raise NameError)
df_a['education'].value_counts()[ind].plot(kind='bar');

python数据分析pandas基础用法

重命名列

猜你喜欢