Financial Risk Control Task 2 - Exploratory Data Analysis

1 Overall understanding of the data:

# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')

1.1 Read the data set and check its size and original feature dimensions

# Load the training set and test set A from the local data directory.
train = pd.read_csv('./data/train.csv')
testA = pd.read_csv('./data/testA.csv')
# Print the (rows, columns) shape of each data set.
print(train.shape)
print(testA.shape)
# Preview the first and last three rows of the training set.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([train.head(3), train.tail(3)])

insert image description here

# Preview the first and last three rows of test set A.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([testA.head(3), testA.tail(3)])

insert image description here

# Show the column labels of the training set.
train.columns

insert image description here

1.2 Get familiar with data types through info

# Print the per-column summary (dtype and non-null count) of the training set.
train.info()

insert image description here

1.3 Roughly view the basic statistics of each feature in the data set

# Compute descriptive statistics for the columns of the training set.
train.describe()

insert image description here

2 Missing and unique values

2.1 View data missing values

# Number of columns that contain at least one missing value.
train.isnull().any().sum()
# Missing-value ratio of every column, keyed by column name.
have_null_fea_dict = train.isnull().sum().div(len(train)).to_dict()
# Keep only the columns in which more than half of the values are missing.
fea_null = {
    fea: ratio
    for fea, ratio in have_null_fea_dict.items()
    if ratio > 0.5
}
print(fea_null)

2.2 View missing features and missing rate

# Fraction of missing values per column.
missing = train.isnull().sum() / len(train)
# Keep only columns that actually have missing values, ordered from the
# lowest to the highest missing rate, and draw them as a bar chart.
miss = missing[missing > 0].sort_values(ascending=True)
miss.plot.bar()

insert image description here

2.3 View the features that have only one value in the training set and test set (the code for this step is not included here)

2.4 View which features are numerical types and which are object types

# Columns whose dtype is not `object` are treated as numerical features.
numerical_fea = train.select_dtypes(exclude=['object']).columns.tolist()
# Every remaining column is treated as an object-typed (categorical) feature.
category_fea = [col for col in train.columns if col not in numerical_fea]
print(numerical_fea)
print(category_fea)

insert image description here

2.5 Analysis of numerical variables, including continuous variables and discrete variables

2.5.1 Divide continuous variables and discrete variables in numerical variables
def get_numerical_serial_fea(data, feas, threshold=20):
    """Split numerical features into continuous and discrete ones.

    A feature is classified by the number of distinct non-null values it
    takes in ``data``: at most ``threshold`` distinct values means discrete,
    anything above means continuous.

    Args:
        data: DataFrame that contains every column named in ``feas``.
        feas: Iterable of column names to classify.
        threshold: Largest number of distinct values a feature may have and
            still be considered discrete. Defaults to 20.

    Returns:
        A tuple ``(numerical_serial_fea, numerical_noserial_fea)`` holding
        the continuous and the discrete feature names, each in the order
        they appear in ``feas``.
    """
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        if data[fea].nunique() <= threshold:
            numerical_noserial_fea.append(fea)
        else:
            numerical_serial_fea.append(fea)
    return numerical_serial_fea, numerical_noserial_fea
# Classify the numerical features of the training set and print both groups:
# first the continuous features, then the discrete ones.
numerical_serial_fea,numerical_noserial_fea=get_numerical_serial_fea(train,numerical_fea)
print(numerical_serial_fea)
print(numerical_noserial_fea)

insert image description here

2.5.2 View discrete variables one by one
# Frequency of each distinct value for the discrete numerical features.
train['term'].value_counts()
train['homeOwnership'].value_counts()
train['verificationStatus'].value_counts()
train['purpose'].value_counts()
train['pubRecBankruptcies'].value_counts()  # counts differ greatly between values
train['initialListStatus'].value_counts()
train['applicationType'].value_counts()  # counts differ greatly between values
train['policyCode'].value_counts()  # single value, uninformative variable
train['n11'].value_counts()  # counts differ greatly between values
train['n12'].value_counts()  # counts differ greatly between values
2.5.3 Numerical continuous variable analysis - distribution visualization for each numerical feature
# Reshape the continuous features into long format: one row per
# (variable, value) pair.
f = train.melt(value_vars=numerical_serial_fea)
# Draw one distribution plot per feature, four panels per row, with
# independent axes for every panel.
g = sns.FacetGrid(
    f,
    col='variable',
    col_wrap=4,
    sharex=False,
    sharey=False,
).map(sns.distplot, 'value')

insert image description here

2.6 Analysis of non-numerical categorical variables

# Frequency of each distinct value for the non-numerical features.
train['grade'].value_counts()
train['subGrade'].value_counts()
train['employmentLength'].value_counts()
train['issueDate'].value_counts()
train['earliesCreditLine'].value_counts()

3 Variable distribution visualization

3.1 Univariate distribution visualization

plt.figure(figsize=(8, 8))
# Count each employment length (missing values included) once and keep the
# first 20 entries of the result.
employment_counts = train['employmentLength'].value_counts(dropna=False)[:20]
# Pass the vectors by keyword: seaborn >= 0.12 no longer accepts x and y as
# positional arguments. Counts go on the x axis, labels on the y axis.
sns.barplot(x=employment_counts, y=employment_counts.keys())
plt.show()

insert image description here

3.2 Time Format Data Processing and Viewing

# Reference date from which the issue dates are measured.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
# Convert issueDate to datetime in both data sets and derive issueDateDT,
# the number of days between the issue date and the reference date.
# The subtraction is vectorised instead of applied row by row.
for frame in (train, testA):
    frame['issueDate'] = pd.to_datetime(frame['issueDate'])
    frame['issueDateDT'] = (frame['issueDate'] - startdate).dt.days
# Plot the distribution of issueDateDT for both data sets on the same axes.
plt.hist(train['issueDateDT'], label='train')
plt.hist(testA['issueDateDT'], label='testA')
plt.legend()

insert image description here

3.3 Understanding the Data with Pivot Tables

# Pivot table: there can be several index keys, `columns` is optional, and
# the aggregation function `aggfunc` is applied to the items listed in
# `values`. Here: total loanAmnt per grade (rows) and issueDateDT (columns).
# The string 'sum' is used instead of np.sum, which pandas >= 2.1 flags with
# a FutureWarning when passed as a callable.
pd.pivot_table(
    train,
    index=['grade'],
    columns=['issueDateDT'],
    values=['loanAmnt'],
    aggfunc='sum',
)

insert image description here

3.4 Generating data reports with pandas_profiling

# NOTE(review): the pandas_profiling package appears to have been renamed
# ydata-profiling upstream — confirm which one is installed.
import pandas_profiling
# Build a profiling report for the training set and write it to an HTML file.
pfr = pandas_profiling.ProfileReport(train)
pfr.to_file("./example.html")

Supongo que te gusta

Origin blog.csdn.net/BigCabbageFy/article/details/108671287
Recomendado
Clasificación