Machine Learning in Practice: Risk Control Scorecard Model

Kaggle link: https://www.kaggle.com/c/GiveMeSomeCredit

1. Introduction

A classic risk control model on Kaggle: improve on the current state of credit scoring by predicting the likelihood that someone will experience financial difficulties within the next two years.

1.1 Competition Description
Banks play a vital role in a market economy. They decide who can get access to financing, and under what conditions, and their decisions can make or break investments. For individuals and companies to participate in the market and in society, they need access to credit.

Credit scoring algorithms, which estimate the probability of default, are the method banks use to decide whether a loan should be granted.

The contest asks participants to improve on the state of the art in credit scoring by predicting the likelihood that someone will experience financial distress in the next two years.

The goal of this competition is to build a model that borrowers can use to help make the best financial decisions. It provides historical data on 250,000 borrowers, with a prize pool of $5,000 (first prize $3,000, second place $1,500, third place $500).

1.2 Evaluation
The competition uses AUC (Area Under the ROC Curve) as the performance evaluation criterion, i.e. the area under the ROC curve.

ROC stands for receiver operating characteristic. The x-axis is the false positive rate (False Positive Rate) and the y-axis is the true positive rate (True Positive Rate).
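
As a quick illustration (not part of the original solution, and the labels and scores here are made-up placeholders), the AUC used for scoring can be computed locally with scikit-learn once a model produces predicted probabilities:

# Minimal sketch of computing ROC AUC with scikit-learn; y_true and y_scores are hypothetical
from sklearn.metrics import roc_auc_score, roc_curve

y_true = [0, 0, 1, 1]             # ground-truth labels (example values)
y_scores = [0.1, 0.4, 0.35, 0.8]  # predicted probabilities of the positive class (example values)

fpr, tpr, thresholds = roc_curve(y_true, y_scores)  # points of the ROC curve
print(roc_auc_score(y_true, y_scores))              # area under that curve, here 0.75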

1.3 Data Description
The data dictionary is taken from the Data Dictionary.xls file provided with the competition:
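(The table itself did not survive in this copy of the post. For reference, the variables below are the columns of the training file, with descriptions paraphrased from the competition's data dictionary.)

  • SeriousDlqin2yrs: person experienced 90 days past due delinquency or worse (the target to predict)
  • RevolvingUtilizationOfUnsecuredLines: total balance on credit cards and personal lines of credit divided by the sum of credit limits
  • age: age of the borrower in years
  • NumberOfTime30-59DaysPastDueNotWorse: number of times the borrower was 30-59 days past due, but no worse, in the last 2 years
  • DebtRatio: monthly debt payments, alimony, and living costs divided by monthly gross income
  • MonthlyIncome: monthly income
  • NumberOfOpenCreditLinesAndLoans: number of open loans and lines of credit
  • NumberOfTimes90DaysLate: number of times the borrower was 90 days or more past due
  • NumberRealEstateLoansOrLines: number of mortgage and real estate loans, including home equity lines of credit
  • NumberOfTime60-89DaysPastDueNotWorse: number of times the borrower was 60-89 days past due, but no worse, in the last 2 years
  • NumberOfDependents: number of dependents in the family, excluding the borrower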

A note in passing: for each competition, Kaggle provides contestants with

  • A data dictionary (it may be given in the competition introduction or as a separate file, as in this case)
  • A training set
  • A test set (without the target column)
  • A sample submission file (in this case sampleEntry.csv; see the quick check below)
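
A small sketch (not in the original post) for peeking at the sample submission to confirm the expected format; it assumes the files have already been downloaded and extracted by the fetch_data() function defined in the code section below:

import pandas as pd

# Inspect the sample submission shipped with the competition data (path matches fetch_data below)
sample = pd.read_csv("datasets/GiveMeSomeCredit/sampleEntry.csv")
print(sample.head())  # expected columns: Id and Probability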

 

2. The Code

# Fetch the data
import os
import zipfile
from six.moves import urllib

FILE_NAME = "GiveMeSomeCredit.zip"  # file name
DATA_PATH = "datasets/GiveMeSomeCredit"  # folder to store the data; give it the same (or a similar) name as the file so it is easy to tell apart
DATA_URL = "https://github.com/824024445/KaggleCases/blob/master/datasets/" + FILE_NAME + "?raw=true"

def fetch_data(data_url=DATA_URL, data_path=DATA_PATH, file_name=FILE_NAME):
    if not os.path.isdir(data_path): # check whether "datasets/GiveMeSomeCredit" exists under the current folder; create it if not
        os.makedirs(data_path)
    zip_path = os.path.join(data_path, file_name) # path and name of the file downloaded locally
    # urlretrieve() downloads the remote data straight to the local machine
    urllib.request.urlretrieve(data_url, zip_path) # the second argument, zip_path, is the local path to save to
    data_zip = zipfile.ZipFile(zip_path)
    data_zip.extractall(path=data_path) # with no arguments it extracts to the current folder by default; to keep things consistent, all of this competition's data goes into the GiveMeSomeCredit folder
    data_zip.close()
fetch_data()

import pandas as pd
import numpy as np

train_df = pd.read_csv("datasets/GiveMeSomeCredit/cs-training.csv")
test_df = pd.read_csv("datasets/GiveMeSomeCredit/cs-test.csv")
combine=[train_df, test_df]
train_df.head()

train_df.info()
test_df.info()

train_df.describe()

import matplotlib.pyplot as plt
import seaborn as sns
# Look for correlations with the target
corr_matrix = train_df.corr()
print(corr_matrix["SeriousDlqin2yrs"].sort_values(ascending=False))

#fig, ax = plt.subplots(figsize=(12,12))
#sns.heatmap(corr_matrix,xticklabels=corr_matrix.columns,yticklabels=corr_matrix.columns,annot=True)
# Data processing
# Handle missing values
for data in combine:
  data["MonthlyIncome"].fillna(data["MonthlyIncome"].mean(), inplace=True)
  data["NumberOfDependents"].fillna(data["MonthlyIncome"].mode()[0], inplace=True) # note: as written this fills with the mode of MonthlyIncome, not NumberOfDependents (likely a slip); the anomalous values it introduces are cleaned up in the outlier step below
train_df.info()
# Handle outliers
train_df.NumberOfDependents.value_counts()
# (Surprisingly, NumberOfDependents contains values of 6670, and quite a few of them, about 2.6% of rows; fill them with 0, since most values are 0)
# Before filling, check the correlation between NumberOfDependents and the target so the effect can be compared; before the fix it is -0.013881
for data in combine:
  data.loc[data["NumberOfDependents"] > 30, "NumberOfDependents"] = 0

train_df.corr()["SeriousDlqin2yrs"]["NumberOfDependents"]
# After fixing the outliers, the correlation of "NumberOfDependents" rises to 0.046869
train_df = train_df[train_df["age"]>18]
test_df = test_df[test_df["age"]>18]

combine = [train_df, test_df]

# Create new features by combining existing ones
for data in combine:
  data["CombinedDefaulted"] = data["NumberOfTimes90DaysLate"] + data["NumberOfTime60-89DaysPastDueNotWorse"] + data["NumberOfTime30-59DaysPastDueNotWorse"]
  data.loc[(data["CombinedDefaulted"] >= 1), "CombinedDefaulted"] = 1

  data["CombinedCreditLoans"] = data["NumberOfOpenCreditLinesAndLoans"] + data["NumberRealEstateLoansOrLines"]
  data.loc[(data["CombinedCreditLoans"] <= 5), "CombinedCreditLoans"] = 0
  data.loc[(data["CombinedCreditLoans"] > 5), "CombinedCreditLoans"] = 1
# Check how well the new features correlate with the target
train_df.corr()["SeriousDlqin2yrs"][["CombinedDefaulted", "CombinedCreditLoans"]]

# Model and predict
# The data needs to be split again: the test set above is for the final submission and has no target values.
# After submitting to Kaggle it returns an AUC score, which measures generalization; first, though, we have to evaluate the current model ourselves.
attributes = ["SeriousDlqin2yrs", 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfDependents', 'MonthlyIncome', "CombinedDefaulted", "CombinedCreditLoans"]
sol = ['SeriousDlqin2yrs']

attributes2 = ["Unnamed: 0", 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfDependents', 'MonthlyIncome', "CombinedDefaulted", "CombinedCreditLoans"]

train_df = train_df[attributes]
test_df = test_df[attributes2]
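
Before building the tester below, a minimal local sanity check can be done with a simple hold-out split. This sketch is not in the original post; it reuses the feature columns above and the same random-forest settings used later:

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

_clean = train_df.dropna()  # same "Drop Missing" strategy as the tester below
X_tr, X_val, y_tr, y_val = train_test_split(
    _clean.drop("SeriousDlqin2yrs", axis=1), _clean["SeriousDlqin2yrs"],
    test_size=0.2, random_state=0)
_rf = RandomForestClassifier(n_estimators=15, max_depth=6, random_state=0).fit(X_tr, y_tr)
print("hold-out AUC:", roc_auc_score(y_val, _rf.predict_proba(X_val)[:, 1]))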

##### Train the model

import time
import os
from sklearn.model_selection import cross_validate

class Tester():
    def __init__(self, target):
        self.target = target
        self.datasets = {}
        self.models = {}
        self.cache = {} # a simple cache to speed up repeated runs

    def addDataset(self, name, df):
        self.datasets[name] = df.copy()

    def addModel(self, name, model):
        self.models[name] = model
        
    def clearModels(self):
        self.models = {}

    def clearCache(self):
        self.cache = {}
    
    def testModelWithDataset(self, m_name, df_name, sample_len, cv):
        if (m_name, df_name, sample_len, cv) in self.cache:
            return self.cache[(m_name, df_name, sample_len, cv)]

        clf = self.models[m_name]
        
        if not sample_len: 
            sample = self.datasets[df_name]
        else: sample = self.datasets[df_name].sample(sample_len)
            
        X = sample.drop([self.target], axis=1)
        Y = sample[self.target]

        s = cross_validate(clf, X, Y, scoring=['roc_auc'], cv=cv, n_jobs=-1)
        self.cache[(m_name, df_name, sample_len, cv)] = s

        return s

    def runTests(self, sample_len=80000, cv=4):
        # Test every added model on every added dataset
        scores = {}
        for m_name in self.models:
            for df_name in self.datasets:
                # print('Testing %s' % str((m_name, df_name)), end='')
                start = time.time()

                score = self.testModelWithDataset(m_name, df_name, sample_len, cv)
                scores[(m_name, df_name)] = score
                
                end = time.time()
                
                # print(' -- %0.2fs ' % (end - start))

        print('--- Top 10 Results ---')
        for score in sorted(scores.items(), key=lambda x: -1 * x[1]['test_roc_auc'].mean())[:10]:
            auc = score[1]['test_roc_auc']
            print("%s --> AUC: %0.4f (+/- %0.4f)" % (str(score[0]), auc.mean(), auc.std()))

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# We will use the tester object for all models
tester = Tester('SeriousDlqin2yrs')

# Add datasets
tester.addDataset('Drop Missing', train_df.dropna())

# Add models
rfc = RandomForestClassifier(n_estimators=15, max_depth = 6, random_state=0)
log = LogisticRegression()
tester.addModel('Simple Random Forest', rfc)
tester.addModel('Simple Logistic Regression', log)
#('Simple Random Forest', 'Drop Missing') --> AUC: 0.8519 (+/- 0.0046)
#('Simple Logistic Regression', 'Drop Missing') --> AUC: 0.6511 (+/- 0.0078)
# Run the tests
tester.runTests()
# From the results above, the random forest has the higher AUC, so use it for the final prediction
X_train = train_df.drop(['SeriousDlqin2yrs'], axis=1)
Y_train = train_df['SeriousDlqin2yrs']
X_test = test_df.drop(["Unnamed: 0"], axis=1)
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict_proba(X_test)

# Build the submission file for Kaggle
submission = pd.DataFrame({
    "Id": test_df["Unnamed: 0"],
    "Probability": pd.DataFrame(Y_pred)[1]
})
submission.to_csv('submission.csv', index=False)

 
