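在平时的工作中,不管是建模还是做变量分析探索,很多时候都需要了解变量对好坏客户的区分能力,这种情况下通常需要计算变量的IV值。IV的计算公式是
$$IV=\sum_{i=1}^{n}\left(gpct_i-bpct_i\right)\ln\left(\frac{gpct_i}{bpct_i}\right)$$
其中,$gpct_i$ 是第 $i$ 个分组内的好客户数占全部好客户数的比例,$bpct_i$ 是第 $i$ 个分组内的坏客户数占全部坏客户数的比例,$n$ 为分组数。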
下面直接给出SAS计算变量IV值的代码,这段代码是我自己写的,适用于数值型和字符型变量的IV计算。在调用之前需要先做一些相应的设置,比如结果存放路径、数据集、特殊值等。
上表是经验上的参考值。IV越大对模型的效果来说肯定更好,但在实际中很多时候IV高的变量非常少,更多地要靠众多低IV的变量来构建模型。这时候如果需要模型有比较好的区分效果,可能就要从算法方面做一些改变,比如用集成算法或深度学习算法去代替传统的LR模型。
/* Library that receives every result table written by the IV macros. */
LIBNAME IV 'F:\WORK\IV';

/*********** Settings to review before running the IV computation ***********/

/* Input data set. */
%LET INDATA=Creditcard1;

/* Quoted, comma-separated names of variables excluded from the computation
   (ID, good/bad flag, ...). The comparison against the variable names is
   case-sensitive. */
%LET EXP_VAR='CustID','Status';

/* Special values of numeric variables; they are binned separately from the
   regular values. Adjust to the data at hand. */
%LET SPE_VAR=.,-9999,9999,-9993;

/* Maximum number of bins for the non-special values of a numeric variable. */
%LET N=10;

/* Good/bad flag column, coded 0 (good) / 1 (bad). */
%LET GB=Status;

/*********** End of settings ***********/
/*
 * %NUM_IV - WOE/IV tables for the numeric variables.
 *
 * Called by %IV. Reads from the caller's scope:
 *   &N_NOBS            number of numeric variables
 *   &NVAR1..&NVARn     their names
 * and from the settings: &INDATA, &GB, &SPE_VAR, &N.
 *
 * Writes one detail table IV.<variable> per variable and the summary table
 * IV.IV_NUM_VAR (one row per variable: LABEL, NAME, IV).
 */
%MACRO NUM_IV;
    %LOCAL I TDISD TNNUM TC NREG_VAR TOTG TOTB;

    /* Zero-row shell that fixes the column attributes of the summary table. */
    DATA IV.IV_NUM_VAR;
        FORMAT LABEL $100. NAME $32. IV;
        STOP;
    RUN;

    %DO I=1 %TO &N_NOBS;

        /* Separate the special values from the regular ones. */
        DATA DATA_REG DATA_NREG;
            SET &INDATA(KEEP=&&NVAR&I &GB);
            IF &&NVAR&I IN (&SPE_VAR) THEN OUTPUT DATA_NREG;
            ELSE OUTPUT DATA_REG;
        RUN;

        /* Number of rows carrying a special value. */
        %LET TDISD=%SYSFUNC(OPEN(DATA_NREG));
        %LET TNNUM=%SYSFUNC(ATTRN(&TDISD,NOBS));
        %PUT &TNNUM;
        %LET TC=%SYSFUNC(CLOSE(&TDISD));

        /* Bin the regular values into at most &N rank groups. */
        PROC RANK DATA=DATA_REG OUT=RANK GROUPS=&N;
            VAR &&NVAR&I;
            RANKS BIN;
        RUN;

        /* Per-bin statistics; only the MIN and MAX rows are used below. */
        PROC MEANS DATA=RANK N MAX MIN NOPRINT;
            CLASS BIN;
            VAR &&NVAR&I;
            OUTPUT OUT=RANK1;
        RUN;

        /* One row per bin with its lower (START) and upper (END) bound. */
        PROC TRANSPOSE DATA=RANK1(WHERE=(NOT MISSING(BIN) AND _STAT_ IN ('MAX','MIN')))
            OUT=RANK2(DROP =_NAME_ RENAME=(MIN=START MAX=END));
            BY BIN;
            ID _STAT_;
            VAR &&NVAR&I;
        RUN;

        /* Readable bin description: "<sequence>、<start>-<end>". */
        DATA RANK3;
            SET RANK2;
            LENGTH BIN_GROUP $30.;
            BIN_GROUP=COMPRESS(_N_||'、'||START||'-'||END);
        RUN;

        /* Good / bad counts per bin. */
        PROC SQL;
            CREATE TABLE RANK4 AS
            SELECT
                BIN,
                SUM(CASE WHEN &GB=0 THEN 1 ELSE 0 END) AS G_CNT,
                SUM(CASE WHEN &GB=1 THEN 1 ELSE 0 END) AS B_CNT
            FROM RANK
            GROUP BY
                BIN;
        QUIT;

        /* _LABEL_ is declared explicitly as character: PROC TRANSPOSE only
           emits it when the variable has a label, and the later steps need
           it to exist with a fixed type and length. */
        DATA RANK5;
            LENGTH BIN 8 _LABEL_ $100 BIN_GROUP $30;
            MERGE RANK3 RANK4;
            BY BIN;
        RUN;

        /* All special values are pooled into a single bin numbered 0. */
        %IF &TNNUM>0 %THEN %DO;
            PROC SQL NOPRINT;
                SELECT DISTINCT &&NVAR&I INTO :NREG_VAR SEPARATED BY ',' FROM DATA_NREG;
                CREATE TABLE DATA_NREG1 AS
                SELECT
                    "0、&NREG_VAR" AS BIN_GROUP FORMAT $30.,
                    SUM(CASE WHEN &GB=0 THEN 1 ELSE 0 END) AS G_CNT,
                    SUM(CASE WHEN &GB=1 THEN 1 ELSE 0 END) AS B_CNT
                FROM DATA_NREG
                GROUP BY
                    "0、&NREG_VAR";
            QUIT;
        %END;

        /* Special-value bin (when present) first, then the regular bins.
           _LABEL_ is $100. so it matches LABEL in the summary table. */
        DATA RANK6;
            FORMAT _LABEL_ $100. BIN_GROUP $30. START END G_CNT B_CNT;
            SET
            %IF &TNNUM>0 %THEN %DO;
                DATA_NREG1
            %END;
                RANK5;
            KEEP _LABEL_ BIN_GROUP START END G_CNT B_CNT;
        RUN;

        /* Overall good / bad totals. Sum statements start at 0 and are
           retained, so no explicit initialisation is required. */
        DATA RANK7;
            SET RANK6 END=LAST;
            CUM_G_CNT+G_CNT;
            CUM_B_CNT+B_CNT;
            IF LAST THEN DO;
                CALL SYMPUTX('TOTG',CUM_G_CNT);
                CALL SYMPUTX('TOTB',CUM_B_CNT);
            END;
        RUN;

        /* Detail table: bad rate, WOE and running IV per bin. The IV value
           on the last row is the IV of the variable. */
        DATA IV.&&NVAR&I;
            SET RANK7;
            G_CNT_PCT=ROUND(G_CNT/&TOTG,0.0001);
            B_CNT_PCT=ROUND(B_CNT/&TOTB,0.0001);
            BAD_RATE=ROUND(B_CNT/(B_CNT+G_CNT),0.0001);
            /* A bin without goods or without bads contributes WOE = 0. */
            IF B_CNT_PCT^=0 AND G_CNT_PCT^=0 THEN WOE=ROUND(LOG(G_CNT_PCT/B_CNT_PCT),0.0001);
            ELSE WOE=0;
            IV_SEG=ROUND((G_CNT_PCT-B_CNT_PCT)*WOE,0.0001);
            IV+IV_SEG;
            KEEP _LABEL_ BIN_GROUP START END G_CNT B_CNT BAD_RATE WOE IV;
            RENAME _LABEL_=LABEL;
        RUN;

        /* Last detail row -> one summary row. LABEL is declared $100. so it
           is not shortened before reaching the summary table. */
        DATA TEMP_IV;
            FORMAT LABEL $100. NAME $32. IV;
            SET IV.&&NVAR&I END=LAST;
            NAME="&&NVAR&I";
            IF LAST;
            KEEP LABEL NAME IV;
        RUN;

        DATA IV.IV_NUM_VAR;
            SET IV.IV_NUM_VAR TEMP_IV;
        RUN;
    %END;
%MEND;

/*
 * %CHAR_IV - WOE/IV tables for the character variables.
 *
 * Called by %IV. Reads from the caller's scope:
 *   &C_NOBS            number of character variables
 *   &CVAR1..&CVARn     their names
 * and from the settings: &INDATA, &GB.
 *
 * Every distinct value of a variable is its own bin. Writes one detail
 * table IV.<variable> per variable and the summary table IV.IV_CHAR_VAR.
 */
%MACRO CHAR_IV;
    %LOCAL I TOTG TOTB;

    /* Zero-row shell that fixes the column attributes of the summary table. */
    DATA IV.IV_CHAR_VAR;
        FORMAT LABEL $100. NAME $32. IV;
        STOP;
    RUN;

    %DO I=1 %TO &C_NOBS;

        DATA DATA_REG;
            SET &INDATA(KEEP=&&CVAR&I &GB);
        RUN;

        /* Good / bad counts per distinct value. */
        PROC SQL;
            CREATE TABLE RANK1 AS
            SELECT
                &&CVAR&I,
                SUM(CASE WHEN &GB=0 THEN 1 ELSE 0 END) AS G_CNT,
                SUM(CASE WHEN &GB=1 THEN 1 ELSE 0 END) AS B_CNT
            FROM DATA_REG
            GROUP BY
                &&CVAR&I;
        QUIT;

        /* Variable label, bin description "<sequence>、<value>" and the
           overall good / bad totals. LABEL is $100 to match the summary
           table; the sum statements start at 0 and are retained. */
        DATA RANK2;
            LENGTH LABEL $100;
            FORMAT LABEL BIN_GROUP;
            SET RANK1 END=LAST;
            LABEL=VLABEL(&&CVAR&I);
            BIN_GROUP=COMPRESS(_N_||'、'||&&CVAR&I);
            CUM_G_CNT+G_CNT;
            CUM_B_CNT+B_CNT;
            IF LAST THEN DO;
                CALL SYMPUTX('TOTG',CUM_G_CNT);
                CALL SYMPUTX('TOTB',CUM_B_CNT);
            END;
        RUN;

        /* Detail table: bad rate, WOE and running IV per value. */
        DATA IV.&&CVAR&I;
            SET RANK2;
            G_CNT_PCT=ROUND(G_CNT/&TOTG,0.0001);
            /* Share of bads is taken over the total number of bads. */
            B_CNT_PCT=ROUND(B_CNT/&TOTB,0.0001);
            BAD_RATE=ROUND(B_CNT/(B_CNT+G_CNT),0.0001);
            /* A value without goods or without bads contributes WOE = 0. */
            IF B_CNT_PCT^=0 AND G_CNT_PCT^=0 THEN WOE=ROUND(LOG(G_CNT_PCT/B_CNT_PCT),0.0001);
            ELSE WOE=0;
            IV_SEG=ROUND((G_CNT_PCT-B_CNT_PCT)*WOE,0.0001);
            IV+IV_SEG;
            /* LABEL is the variable created in RANK2; it must be kept here
               so that the summary row below carries the label. */
            KEEP LABEL BIN_GROUP G_CNT B_CNT BAD_RATE WOE IV;
        RUN;

        /* Last detail row -> one summary row. */
        DATA TEMP_IV;
            FORMAT LABEL $100. NAME $32. IV;
            SET IV.&&CVAR&I END=LAST;
            NAME="&&CVAR&I";
            IF LAST;
            KEEP LABEL NAME IV;
        RUN;

        DATA IV.IV_CHAR_VAR;
            SET IV.IV_CHAR_VAR TEMP_IV;
        RUN;
    %END;
%MEND;

/*
 * %IV - entry point.
 *
 * Lists the variables of &INDATA (minus those named in &EXP_VAR), splits
 * them by type and runs %NUM_IV for the numeric ones and %CHAR_IV for the
 * character ones. A branch is skipped when it has no variables.
 */
%MACRO IV;
    %LOCAL DISD C NCHAR NNUM N_NOBS C_NOBS;

    /* Variable inventory; TYPE=1 is numeric in PROC CONTENTS output. */
    PROC CONTENTS DATA=&INDATA
        OUT=VARLIST(KEEP=NAME TYPE WHERE=(NAME NOT IN (&EXP_VAR))) NOPRINT;
    RUN;

    DATA DATA_CHAR DATA_NUM;
        SET VARLIST;
        IF TYPE=1 THEN OUTPUT DATA_NUM;
        ELSE OUTPUT DATA_CHAR;
    RUN;

    /* Number of character variables. */
    %LET DISD=%SYSFUNC(OPEN(DATA_CHAR));
    %LET NCHAR=%SYSFUNC(ATTRN(&DISD,NOBS));
    %PUT &NCHAR;
    %LET C=%SYSFUNC(CLOSE(&DISD));

    /* Number of numeric variables. */
    %LET DISD=%SYSFUNC(OPEN(DATA_NUM));
    %LET NNUM=%SYSFUNC(ATTRN(&DISD,NOBS));
    %PUT &NNUM;
    %LET C=%SYSFUNC(CLOSE(&DISD));

    /* IV for numeric variables. */
    %IF &NNUM>0 %THEN %DO;
        /* NVAR1..NVARn hold the names, N_NOBS the count. CALL SYMPUTX
           stores trimmed values without numeric-to-character notes. */
        DATA _NULL_;
            SET DATA_NUM END=LAST;
            CALL SYMPUTX(CATS('NVAR',_N_),NAME);
            IF LAST THEN CALL SYMPUTX('N_NOBS',_N_);
        RUN;
        %NUM_IV;
    %END;

    /* IV for character variables. */
    %IF &NCHAR>0 %THEN %DO;
        /* CVAR1..CVARn hold the names, C_NOBS the count. */
        DATA _NULL_;
            SET DATA_CHAR END=LAST;
            CALL SYMPUTX(CATS('CVAR',_N_),NAME);
            IF LAST THEN CALL SYMPUTX('C_NOBS',_N_);
        RUN;
        %CHAR_IV;
    %END;
%MEND;
%IV;