sklearn WOE/IV: breast cancer classifier in practice

 

https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

 

 

Pharmaceutical Statistics Project Contact QQ: 231469242

 

If the sample size is small, the variables must be binned (segmented) before computing WOE. Otherwise most distinct values occur in only one of the two classes, which leaves many empty (NaN) cells, and WOE cannot work effectively.

 

 

Random Forest Results

Every factor with IV > 0.02 is also an effective factor in the random forest results. However, the factors that the random forest ranks as most important do not appear among the variables with an effective IV. This suggests that those important variables were not binned (segmented): their values are too scattered, so their IV cannot be estimated reliably.

 

 

data files

 

script backup

step1_customers_split_goodOrBad.py

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 21:45:43 2018

@author  QQ:231469242

Split the source data into two Excel files: one with the good customers and one with the bad customers
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Source workbook that holds every sample
readFileName="breast_cancer_总.xlsx"

# Destination workbooks, one per diagnosis class
saveFileName_good="result_good.xlsx"
saveFileName_bad="result_bad.xlsx"

# Load the full data set
df=pd.read_excel(readFileName)

# Filter the rows by diagnosis label: "B" rows form the "good" group,
# "M" rows form the "bad" group
df_good=df.loc[df.diagnosis.eq("B")]
df_bad=df.loc[df.diagnosis.eq("M")]

# Persist each subset to its own workbook.
# NOTE(review): to_excel also writes the row index as a leading column by
# default -- confirm that whatever reads these files expects that column.
for subset, target in ((df_good, saveFileName_good), (df_bad, saveFileName_bad)):
    subset.to_excel(target, sheet_name='Sheet1')

  

step2_automate_find_informative_variables.py

# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 22:13:30 2018

@author: QQ:231469242
Negative WOE: the value's share among good customers is smaller than its share among bad customers
Positive WOE: the value's share among good customers is larger than its share among bad customers
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Create the output directory for the per-variable workbooks.
# os.makedirs(..., exist_ok=True) keeps re-runs from failing with
# FileExistsError when "save/" is already present. The call returns None;
# the newFile name is kept only so existing references keep resolving.
newFile=os.makedirs("save/", exist_ok=True)

# Input workbooks: one per customer group
FileName_good="result_good.xlsx"
FileName_bad="result_bad.xlsx"

# File name for a combined WOE/IV workbook (not referenced in the visible
# part of this script)
saveFileName="result_woe_iv.xlsx"

# Load both groups, good first and bad second
df_good, df_bad = (pd.read_excel(path) for path in (FileName_good, FileName_bad))

# Every column name of the good-customer sheet except the last one
list_columns=[*df_good.columns[:-1]]

# Default variable position (the first column)
index=0

def Ratio_goodDevideBad(index):
    """Compute the per-value shares of one column in the good and bad groups.

    The column is picked by position in ``df_good.columns`` and then looked
    up by name in both module-level frames ``df_good`` and ``df_bad``.

    Args:
        index: Zero-based position of the column in ``df_good.columns``.

    Returns:
        An 8-tuple ``(columnName, num_goodCustomers, num_badCustomers,
        frenquency_goodCustomers, frenquency_badCustomers,
        ratio_goodCustomers, ratio_badCustomers, ratio_goodDevideBad)``:
        the column name, the number of non-NaN entries per group, the value
        counts per group, the value counts divided by those totals, and the
        good share divided by the bad share.
    """
    columnName = list(df_good.columns)[index]

    good_values = df_good[columnName]
    bad_values = df_bad[columnName]

    # Group sizes with missing entries excluded
    good_total = good_values.dropna().size
    bad_total = bad_values.dropna().size

    # How often each distinct value occurs in each group
    good_counts = good_values.value_counts()
    bad_counts = bad_values.value_counts()

    # Share of each distinct value within its own group
    good_share = good_counts / good_total
    bad_share = bad_counts / bad_total

    # The final element is the good-to-bad ratio, aligned on the distinct values
    return (columnName, good_total, bad_total, good_counts, bad_counts,
            good_share, bad_share, good_share / bad_share)

#Weight of evidence, computed element-wise on array-like input
def Woe(ratio_goodDevideBad):
    """Return the natural logarithm of the good-to-bad ratio.

    Args:
        ratio_goodDevideBad: Scalar or array-like of good/bad ratios.

    Returns:
        ``np.log`` applied to the input.
    """
    return np.log(ratio_goodDevideBad)

'''    
#iv function, array calculation
def Iv(woe):
    iv=(ratio_goodCustomers-ratio_badCustomers)*woe
    return iv
    '''

#Grade a variable from its total information value
def Iv_estimate(iv_sum):
    """Print and return a verdict for a variable's total IV.

    Args:
        iv_sum: Total information value of the variable.

    Returns:
        ``"A"`` when ``iv_sum`` is greater than 0.02 (after printing
        "informative"), otherwise ``"B"`` (after printing
        "not informative").
    """
    # Only a value strictly above the threshold counts as informative
    is_informative = iv_sum > 0.02
    message, grade = ("informative", "A") if is_informative else ("not informative", "B")
    print(message)
    return grade
   
    
'''
#Detailed parameter output
def Print():
    print ("columnName:",columnName)
    Iv_estimate(iv_sum)
    print("iv_sum",iv_sum)
    #print("",)
    #print("",)
    '''
    
#Compute WOE/IV for one variable, save the detail table and report a verdict
def Write_singleVariable_to_Excel(index):
    """Write the WOE/IV detail table of one variable and grade the variable.

    Args:
        index: Zero-based position of the variable; 0 is the first variable.

    Returns:
        A tuple ``(iv_estimate, columnName)`` holding the verdict returned
        by ``Iv_estimate`` and the name of the variable.
    """
    (columnName,
     num_goodCustomers,
     num_badCustomers,
     frenquency_goodCustomers,
     frenquency_badCustomers,
     ratio_goodCustomers,
     ratio_badCustomers,
     ratio_goodDevideBad) = Ratio_goodDevideBad(index)

    woe = Woe(ratio_goodDevideBad)
    iv = (ratio_goodCustomers - ratio_badCustomers) * woe

    # One row per distinct value; the dict order fixes the column order
    table = {
        "num_goodCustomers": num_goodCustomers,
        "num_badCustomers": num_badCustomers,
        "frenquency_goodCustomers": frenquency_goodCustomers,
        "frenquency_badCustomers": frenquency_badCustomers,
        "ratio_goodCustomers": ratio_goodCustomers,
        "ratio_badCustomers": ratio_badCustomers,
        "ratio_goodDevideBad": ratio_goodDevideBad,
        "woe": woe,
        "iv": iv,
    }
    df_woe_iv = pd.DataFrame(table, columns=list(table))

    # Largest iv first, written to save/<variable name>.xlsx
    df_sort = df_woe_iv.sort_values(by='iv', ascending=False)
    df_sort.to_excel("save/"+columnName+".xlsx")

    # Total iv of the variable, skipping the NaN entries
    iv_sum = sum(value for value in iv if not np.isnan(value))

    print("Variable:", columnName)
    iv_estimate = Iv_estimate(iv_sum)
    print("iv_sum", iv_sum)
    return iv_estimate, columnName



#Names of the variables graded as informative (status "A")
list_Informative_variables=[]


#Process every variable: write its detail workbook and collect the names
#of the informative ones.
for i in range(len(list_columns)):
    # Call once and unpack both results. Calling the function twice would
    # recompute everything, rewrite the workbook and print the report twice.
    status,columnName=Write_singleVariable_to_Excel(i)

    if status=="A":
        list_Informative_variables.append(columnName)

        

 

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=326263342&siteId=291194637