Pharmaceutical Statistics Project Contact QQ: 231469242
If the sample size is small, the variables must be binned (segmented) first; otherwise many values will occur in only one of the two groups, leaving a lot of empty cells, and WOE cannot be computed effectively.
Random Forest Results
All factors with IV > 0.02 are also effective factors in the random forest results. However, the most important factors in the random forest do not appear among the variables with an effective IV, which indicates that these missing important variables were not binned and their values are too scattered.
data files
script backup
step1_customers_split_goodOrBad.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 21:45:43 2018
@author QQ:231469242

Step 1: split the source workbook into two Excel files according to the
``diagnosis`` column -- rows labelled "B" are written as the "good" group
and rows labelled "M" as the "bad" group.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input workbook and the two output workbooks.
readFileName = "breast_cancer_总.xlsx"
saveFileName_good = "result_good.xlsx"
saveFileName_bad = "result_bad.xlsx"

# Load the full data set.
df = pd.read_excel(readFileName)

# Boolean masks on the diagnosis label select the two groups.
is_good = df["diagnosis"] == "B"
is_bad = df["diagnosis"] == "M"
df_good = df.loc[is_good]
df_bad = df.loc[is_bad]

# Write each group to its own workbook (the row index is written as well).
for frame, target in ((df_good, saveFileName_good), (df_bad, saveFileName_bad)):
    frame.to_excel(target, sheet_name="Sheet1")
step2_automate_find_informative_variables.py
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 22:13:30 2018
@author: QQ:231469242

Step 2: for every variable, compute the weight of evidence (WOE) and the
information value (IV) of each distinct value, save the per-variable table
to ``save/<variable>.xlsx`` and collect the variables whose total IV exceeds
the threshold.

woe negative number: good-customer ratio < bad-customer ratio
woe positive number: good-customer ratio > bad-customer ratio
"""
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Output directory for the per-variable workbooks.  exist_ok=True lets the
# script be re-run without failing because the directory already exists.
SAVE_DIR = "save/"
os.makedirs(SAVE_DIR, exist_ok=True)

# Input files produced by step 1.
FileName_good = "result_good.xlsx"
FileName_bad = "result_bad.xlsx"
# Name reserved for a combined result file (not written by this script).
saveFileName = "result_woe_iv.xlsx"

# Load the two groups.
df_good = pd.read_excel(FileName_good)
df_bad = pd.read_excel(FileName_bad)

# All variables to analyse: every column except the last one.
list_columns = list(df_good.columns[:-1])


def Ratio_goodDevideBad(index):
    """Compute per-value frequencies and good/bad ratios for one variable.

    Parameters
    ----------
    index : int
        Position of the variable in ``df_good.columns``.

    Returns
    -------
    tuple
        ``(columnName, num_goodCustomers, num_badCustomers,
        frenquency_goodCustomers, frenquency_badCustomers,
        ratio_goodCustomers, ratio_badCustomers, ratio_goodDevideBad)``.
        The counts are the numbers of non-NaN rows; the frequencies and
        ratios are Series indexed by the distinct values of the column.
        A value that occurs in only one group yields NaN in
        ``ratio_goodDevideBad`` because the two Series are aligned on
        their index before dividing.
    """
    columnName = list(df_good.columns)[index]
    column_goodCustomers = df_good[columnName]
    column_badCustomers = df_bad[columnName]

    # Number of non-missing observations in each group.
    num_goodCustomers = column_goodCustomers.dropna().size
    num_badCustomers = column_badCustomers.dropna().size

    # Occurrences of every distinct value in each group.
    frenquency_goodCustomers = column_goodCustomers.value_counts()
    frenquency_badCustomers = column_badCustomers.value_counts()

    # Share of each value within its own group.
    ratio_goodCustomers = frenquency_goodCustomers / num_goodCustomers
    ratio_badCustomers = frenquency_badCustomers / num_badCustomers

    # Good share divided by bad share, value by value.
    ratio_goodDevideBad = ratio_goodCustomers / ratio_badCustomers
    return (columnName, num_goodCustomers, num_badCustomers,
            frenquency_goodCustomers, frenquency_badCustomers,
            ratio_goodCustomers, ratio_badCustomers, ratio_goodDevideBad)


def Woe(ratio_goodDevideBad):
    """Return the WOE, i.e. the natural log of the good/bad ratio (element-wise)."""
    woe = np.log(ratio_goodDevideBad)
    return woe


def Iv_estimate(iv_sum, threshold=0.02):
    """Classify a variable by its total IV.

    Prints "informative" and returns "A" when ``iv_sum`` is greater than
    ``threshold`` (default 0.02); otherwise prints "not informative" and
    returns "B".
    """
    if iv_sum > threshold:
        print("informative")
        return "A"
    print("not informative")
    return "B"


def Write_singleVariable_to_Excel(index):
    """Compute WOE/IV for one variable, save the table and grade the variable.

    Parameters
    ----------
    index : int
        Variable index; the first variable has ``index=0``.

    Returns
    -------
    tuple
        ``(iv_estimate, columnName)`` where ``iv_estimate`` is "A" or "B"
        as returned by ``Iv_estimate``.
    """
    (columnName, num_goodCustomers, num_badCustomers,
     frenquency_goodCustomers, frenquency_badCustomers,
     ratio_goodCustomers, ratio_badCustomers,
     ratio_goodDevideBad) = Ratio_goodDevideBad(index)

    woe = Woe(ratio_goodDevideBad)
    # IV contribution of each value: (good share - bad share) * WOE.
    iv = (ratio_goodCustomers - ratio_badCustomers) * woe

    df_woe_iv = pd.DataFrame(
        {
            "num_goodCustomers": num_goodCustomers,
            "num_badCustomers": num_badCustomers,
            "frenquency_goodCustomers": frenquency_goodCustomers,
            "frenquency_badCustomers": frenquency_badCustomers,
            "ratio_goodCustomers": ratio_goodCustomers,
            "ratio_badCustomers": ratio_badCustomers,
            "ratio_goodDevideBad": ratio_goodDevideBad,
            "woe": woe,
            "iv": iv,
        },
        columns=["num_goodCustomers", "num_badCustomers",
                 "frenquency_goodCustomers", "frenquency_badCustomers",
                 "ratio_goodCustomers", "ratio_badCustomers",
                 "ratio_goodDevideBad", "woe", "iv"],
    )

    # Largest IV contributions first, then write the table to
    # save/<columnName>.xlsx.
    df_sort = df_woe_iv.sort_values(by='iv', ascending=False)
    df_sort.to_excel(os.path.join(SAVE_DIR, columnName + ".xlsx"))

    # Total IV of the variable; Series.sum() skips NaN entries, which arise
    # for values present in only one of the two groups.
    iv_sum = iv.sum()
    print("Variable:", columnName)
    iv_estimate = Iv_estimate(iv_sum)
    print("iv_sum", iv_sum)
    return iv_estimate, columnName


# Names of the variables graded "A" (informative).
list_Informative_variables = []

# Process every variable exactly once: compute, save, and record it when it
# is informative.
for i in range(len(list_columns)):
    status, columnName = Write_singleVariable_to_Excel(i)
    if status == "A":
        list_Informative_variables.append(columnName)