First, create a DataFrame:
import pandas as pd
import numpy as np
# Sample gradebook: every column holds one entry per student.
test_dict = dict(
    id=[1, 2, 3, 4, 5, 6],
    name=['Alice', 'Bob', 'Cindy', 'Eric', 'Helen', 'Grace '],
    math=[90, 89, 99, 78, 97, 93],
    english=[89, 94, 80, 94, 94, 90],
)
df = pd.DataFrame(data=test_dict)
df  # notebook-style display of the frame
Now remember what it looks like, and then fully manipulate it
1. View the data
# Print a concise summary of the DataFrame, including the non-null count of each column.
df.info()
# Compute descriptive statistics (count, mean, std, quartiles, ...) for the columns.
df.describe()
# List every column's positional index next to its label.
for i, v in enumerate(df.columns):
    print(i, v)
loc[] and iloc[]
Both indexers are used to select data from a data frame. Simply put, the difference between them
is that loc[] takes row and column labels, e.g. loc['row label', 'column label'],
while iloc[] takes integer row and column positions: iloc[1, 1] is the second row and second column.
PS: ':' means all rows or all columns.
# Label-based slice: every row, columns 'id' through 'math' (both ends included).
# For the frame built earlier in this tutorial the positional equivalent is df.iloc[:, :3].
df_means = df.loc[:, "id":"math"]
df_means.head(n=3)
# np.r_ joins several slices into one index array, which lets iloc pick
# non-adjacent columns (here positions 0 and 3); this is why numpy is imported.
df_max = df.iloc[:, np.r_[0:1, 3:4]]
df_max
2. Treatment of missing data
Let’s change this table a bit and see that there is a null value
View missing data
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()
Dealing with missing values
1. Delete directly
# axis="index" (same as axis=0) drops the rows that contain missing values;
# axis=1 would drop the affected columns instead.
df.dropna(axis="index", inplace=True)
2. Fill
- Fill with 0
# Replace every missing value with 0, modifying df in place.
df.fillna(value=0, inplace=True)
- Fill with the mean or the mode
# Fill the missing values of each numeric column with that column's mean.
# numeric_only=True skips text columns such as 'name', for which a mean is undefined.
df.fillna(df.mean(numeric_only=True), inplace=True)
# Fill the missing values of several columns with each column's mode
# (its most frequent value); mode() may return several values, [0] takes the first.
df.fillna(
    value={col: df[col].mode()[0] for col in ['name', 'math', 'english']},
    inplace=True,
)
3. Data redundancy
# Boolean mask that marks every row repeating an earlier row.
df.duplicated(keep="first")
# Remove those repeated rows in place, keeping the first occurrence.
df.drop_duplicates(keep="first", inplace=True)
4. Dirty data processing
If a table contains incorrect or malformed values, I will call them dirty data for the time being. How should dirty data be handled? Let's look at the following data frame:
import pandas as pd
import numpy as np
# Same gradebook as before, except Bob's math score is the placeholder string '\\N'.
test_dict = dict(
    id=[1, 2, 3, 4, 5, 6],
    name=['Alice', 'Bob', 'Cindy', 'Eric', 'Helen', 'Grace '],
    math=[90, '\\N', 99, 78, 97, 93],
    english=[89, 94, 80, 94, 94, 90],
)
df = pd.DataFrame(data=test_dict)
df  # notebook-style display of the frame
First we find the row of dirty data:
# Show the rows whose math score is the placeholder string.
df.loc[df['math'].eq('\\N'), :]
Then we want to replace this dirty data with an average score
# Locate the dirty rows by value instead of assuming they sit at row label 1.
dirty_rows = df['math'] == '\\N'
# Average only the clean scores; to_numeric is needed because the placeholder
# string makes the whole column object-typed.
clean_mean = pd.to_numeric(df.loc[~dirty_rows, 'math']).mean()
df.loc[dirty_rows, 'math'] = clean_mean
# Convert the repaired column to a numeric dtype so later statistics and plots
# treat it as numbers.
df['math'] = pd.to_numeric(df['math'])
df
5. Drawing tools
1. Box plot
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 4))
plt.xlim(0, 120)
sns.boxplot(x=df['math'])
# Treat scores above 100 as outliers and list their distinct values.
print('Math score outliers:', df['math'][df['math'] > 100].unique())
2. Pie chart
import matplotlib.pyplot as plt
from collections import Counter

english = list(df['english'])
bins = [79, 85, 90, 95]
# Bucket each score into the intervals (79, 85], (85, 90] and (90, 95].
english_cut = list(pd.cut(english, bins))
# Counter keeps first-seen order, so the labels and counts below stay aligned
# and appear in the order the intervals first occur in the data.
interval_counts = Counter(english_cut)
english_list = list(interval_counts)
count_list = list(interval_counts.values())
print(english_list)
print(count_list)
# Select a font that can render Chinese characters.
plt.rcParams['font.sans-serif'] = ['SimHei']  # default sans-serif font
plt.figure(figsize=(15, 10))
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
# explode needs one offset per wedge, so this tuple assumes exactly three intervals.
plt.pie(count_list, labels=english_list, explode=(0.1, 0, 0.2), autopct='%1.1f%%', startangle=90)
6. Table to table connection
Here are a few functions: merge, concat and append.
Let’s first create two data frames.
# First table: the students' math and english scores.
test_dict1 = {
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['Alice', 'Bob', 'Cindy', 'Eric', 'Helen', 'Grace '],
    'math': [88, 89, 99, 78, 97, 93],
    'english': [89, 94, 80, 94, 94, 90],
}
# Build from test_dict1 (not test_dict) so df1 really holds this table.
df1 = pd.DataFrame(test_dict1)
df1
# Second table: the same students with their sex; shares 'id' and 'name' with df1.
test_dict2 = {
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['Alice', 'Bob', 'Cindy', 'Eric', 'Helen', 'Grace '],
    'sex': ['female', 'male', 'female', 'female', 'female', 'female'],
}
# Build from test_dict2 so df2 carries the 'sex' column used in the join examples.
df2 = pd.DataFrame(test_dict2)
df2
- The merge function joins two tables on their shared columns by default; its other parameters are rarely needed. It only joins two tables at a time.
# Join the two frames on the columns they have in common.
pd.merge(df1, df2)
- concat() function
# Annotated pd.concat call showing its main parameters. join_axes was removed in
# pandas 1.0 (reindex the result instead) and copy is left at its default.
pd.concat(
    objs=[df1, df2],         # objects to combine (a list of DataFrames/Series)
    axis=0,                  # 0 stacks the objects vertically (more rows); 1 places them side by side (more columns)
    join='outer',            # how to align the other axis: 'outer' = union (default), 'inner' = intersection
    ignore_index=False,      # True discards the inputs' index labels and renumbers the result from 0
    keys=None,
    levels=None,
    names=None,
    verify_integrity=False,  # True raises if the combined axis contains duplicates
)
# axis="columns" (same as axis=1): place the frames side by side, aligned on the index.
pd.concat([df1, df2], axis="columns")
# axis="index" (same as axis=0): stack the frames on top of each other.
pd.concat([df1, df2], axis="index")
- The append method adds the rows of another object to the end of the caller (similar to list.append). Note that DataFrame.append was removed in pandas 2.0; use pd.concat instead.
# Reference signature (pandas < 2.0 only):
#     DataFrame.append(other, ignore_index=False, verify_integrity=False, sort=None)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the
# same row-wise stacking of df2 below df1 is done with pd.concat.
pd.concat([df1, df2])
7. Change categorical variables to numerical variables
Let’s look at a table like this and want to turn gender into a numerical variable
import pandas as pd
import numpy as np
# Gradebook with a categorical 'sex' column ('F' / 'M') to be encoded below.
test_dict = dict(
    id=[1, 2, 3, 4, 5, 6],
    name=['Alice', 'Bob', 'Cindy', 'Eric', 'Helen', 'Grace '],
    math=[90, 90, 99, 78, 97, 93],
    sex=['F', 'M', 'F', 'M', 'M', 'M'],
)
df = pd.DataFrame(data=test_dict)
df  # notebook-style display of the frame
Method 1: Use sklearn
from sklearn.preprocessing import LabelEncoder
# Learn the distinct labels of the 'sex' column, then replace each label with
# the integer code the fitted encoder assigned to it.
class_le = LabelEncoder().fit(df['sex'].values)
df['sex'] = class_le.transform(df['sex'].values)
df
Method 2: Use a mapping dictionary to convert the class labels to integers
import numpy as np
# Alternative that derives the mapping from the data instead of hard-coding it:
# class_mapping = {label: idx for idx, label in enumerate(np.unique(df['sex']))}
# Explicit label -> integer mapping.
class_mapping = {'M': 1, 'F': 0}
# Labels missing from the mapping become NaN.
df['sex'] = df['sex'].map(class_mapping)
df
Method 3: Use one-hot encoding to create new dummy features
# 3. Use one-hot encoding to create new dummy features
from sklearn.preprocessing import OneHotEncoder
# Expand 'sex' into one indicator column per category (sex_F and sex_M when the
# column holds the string labels); each row is flagged in its own category's column.
pf = pd.get_dummies(df[['sex']])
# Attach the indicator columns, then discard the original categorical column.
df = pd.concat([df, pf], axis=1)
df = df.drop(columns=['sex'])
df
8. Change the data type
The following function downcasts the column dtypes of a DataFrame:
def downcast_dtypes(df):
    """Downcast the columns of ``df`` to smaller numeric dtypes, in place.

    - ``float64`` columns are cast to ``float32``.
    - ``object`` columns are cast to ``float32``.
    - ``int64`` / ``int32`` columns are cast to ``int16``.

    Columns of any other dtype are left untouched.

    Args:
        df: The DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If an ``object`` column holds a value that cannot be
            converted to a float.

    Note:
        This function does not check that integer values fit into int16
        (-32768..32767); make sure they do before calling it.
    """
    cols_float = [c for c in df if df[c].dtype == 'float64']
    cols_object = [c for c in df if df[c].dtype == 'object']
    cols_int64_32 = [c for c in df if df[c].dtype in ['int64', 'int32']]
    conversions = (
        (cols_float, np.float32),
        (cols_object, np.float32),
        (cols_int64_32, np.int16),
    )
    # Assign column by column so each column is replaced with its new dtype,
    # and an empty column list is simply a no-op.
    for cols, target in conversions:
        for c in cols:
            df[c] = df[c].astype(target)
    return df