热线电话:13121318867

登录
首页精彩阅读SAS信用卡评分之变量分段
SAS信用卡评分之变量分段
2017-06-13
收藏

当评分卡建模的时候用的逻辑回归,最好是将连续变量变成分段变量,即字符变量。把字符变量,观测的种类达到10种以上的时候,建议分下类,最好每个变量(无论数值还是字符)控制在3-7段之间,这是我的建议哈,要是你领导叫你分8段,你就千万就要听领导的。
   然后,我来说下,我这里的最优分段怎么就是最优的呢?
01字符变量

  先发这张图给粘上来,然后我就用简单粗暴的语言解释最优是怎么最优的。就是先把每种情况都列出来,刚开始每一种情况都是一类,然后你还要输入因变量,所以1中就是找出最优的二元分割方法,把原来的一大群先分两大类,然后第2的套路还是跟1一样的,知道分成5份。你问我二元分割最好的指标是什么,你还记得我之前写的代码之前都有带“基尼系数”,“iv值”吗,就是按照这个指标去分的啦。然后这里还要说一点就是,你要是一个变量,总共就1、2、3、4种情况,然后还要最优分段分五份这不是为难嘛。假设你觉得4种情况的分层没有特点,想分的有特点一点,那就可以试着分成3份,2份。分出来的结果对比一下iv值,要是3份的iv值比4份还高或者一样的话,那就是3份还要好些,因为我们都知道变量分段越多iv值越高。
02数值变量

  这是数值变量最优分段的图,其实套路跟字符变量很像,但是数值变量就多了顺序,所以还是有点跟字符有点不像。首先连续变量被分为大量等距的小分段,譬如区间是100的变量,然后就分成50段,那么就是1-2就是一组。那按照跟刚才的字符变量一样的分法类似,就是先分两份,只是对于字符变量多了顺序。但是这里这里要注意一点就是,你本来1、2、3代表的是类的话,在这里就需要把他转成字符,就不要是数值丢进去分段。同样的,要是你不知道分几段的时候,试几次,看下iv值,取一个你觉得最好的iv值。
[卖萌蔬菜动图特殊用途]
我是分割线
好的,两种变量的分类也就这样了啦,好像也没写多少字哦,那就贴代码吧。
options mlogic;

options nomlogic;

%macro gvalue(binds,m_value);

proc sql noprint;

%local i j R N;/*生成局部变量*/

select max(bin)into:R from &binds;/**/

select sum(total) into: N from &binds;/**/

  %do i=1 %to &R;

      %local N_&i._1 N_&i._2 N_&i._s N_s_1 N_s_2;

 Select sum(Ni1) into :N_&i._1 from &BinDS where Bin =&i ;

 Select sum(Ni2) into :N_&i._2 from &BinDS where Bin =&i ;

 Select sum(Total) into :N_&i._s from &BinDS where Bin =&i ;

 Select sum(Ni1) into :N_s_1 from &BinDS ;

 Select sum(Ni2) into :N_s_2 from &BinDS ;

%end;

quit;

/* 检查缺失值 */

   %do i=1 %to &R;

    %do j=1 %to 2;

     %local N_&i._&j;

      %if (&&N_&i._&j=.) or (&&N_&i._&j=0) %then %do ;

       %let &M_Value=.;

       %return;

      %end;

     %end;

   %end;

  %do i=1 %to &r;

     %local E_&i;

     %let E_&i=0;

       %do j=1 %to 2;

       %let E_&i = %sysevalf(&&E_&i - (&&N_&i._&j/&&N_&i._s)*%sysfunc(log(%sysevalf(&&N_&i._&j/&&N_&i._s))) );

       %end;

      %let E_&i = %sysevalf(&&E_&i/%sysfunc(log(2)));

   %end;

   %local E;

    %let E=0;

    %do j=1 %to 2;

    %let E=%sysevalf(&E - (&&N_s_&j/&N)*%sysfunc(log(&&N_s_&j/&N)) );

    %end;

    %let E=%sysevalf(&E / %sysfunc(log(2)));

      %local Er;

      %let Er=0;

       %do i=1 %to &r;

       %let Er=%sysevalf(&Er+ &&N_&i._s * &&E_&i / &N);

       %end;

      %let &M_Value=%sysevalf(1 - &Er/&E);

       %return;

%mend;

%macro CalcMerit(BinDS, ix, M_Value);


%local n_11 n_12 n_21 n_22 n_1s n_2s n_s1 n_s2;

proc sql noprint;

 select sum(Ni1) into :n_11 from &BinDS where i<=&ix;

 select sum(Ni1) into :n_21 from &BinDS where i> &ix;

 select sum(Ni2) into : n_12 from &BinDS where i<=&ix ;

 select sum(Ni2) into : n_22 from &binDS where i> &ix ;

 select sum(total) into :n_1s from &BinDS where i<=&ix ;

 select sum(total) into :n_2s from &BinDS where i> &ix ;

 select sum(Ni1) into :n_s1 from &BinDS;

 select sum(Ni2) into :n_s2 from &BinDS;

quit;

%local N E1 E2 E Er;

      %let N=%eval(&n_1s+&n_2s);

      %let E1=%sysevalf(-( (&n_11/&n_1s)*%sysfunc(log(%sysevalf(&n_11/&n_1s))) +

                                    (&n_12/&n_1s)*%sysfunc(log(%sysevalf(&n_12/&n_1s)))) / %sysfunc(log(2)) ) ;

      %let E2=%sysevalf(-( (&n_21/&n_2s)*%sysfunc(log(%sysevalf(&n_21/&n_2s))) +

                                    (&n_22/&n_2s)*%sysfunc(log(%sysevalf(&n_22/&n_2s)))) / %sysfunc(log(2)) ) ;

      %let E =%sysevalf(-( (&n_s1/&n  )*%sysfunc(log(%sysevalf(&n_s1/&n   ))) +

                                     (&n_s2/&n  )*%sysfunc(log(%sysevalf(&n_s2/&n   )))) / %sysfunc(log(2)) ) ;

      %let Er=%sysevalf(1-(&n_1s*&E1+&n_2s*&E2)/(&N*&E));

      %let &M_value=&Er;

      %return;

%mend;

%macro BestSplit(BinDs, BinNo);

%local mb i value BestValue BestI;

proc sql noprint;

 select count(*) into: mb from &BinDs where Bin=&BinNo;

quit;

%let BestValue=0;

%let BestI=1;

%do i=1 %to %eval(&mb-1);

  %let value=;

  %CalcMerit(&BinDS, &i, Value);

  %if %sysevalf(&BestValue<&value) %then %do;

      %let BestValue=&Value;

        %let BestI=&i;

         %end;

%end;

data &BinDS;

 set &BinDS;

  if i<=&BestI then Split=1;

  else Split=0;

drop i;

run;

proc sort data=&BinDS;

by Split;

run;

data &BinDS;

retain i 0;

set &BinDs;

 by Split;

 if first.split then i=1;

 else i=i+1;

run;

%mend;

%macro CandSplits(BinDS, NewBins);

proc sort data=&BinDS;

by Bin PDV1;

run;

%local Bmax i value;

proc sql noprint;

 select max(bin) into: Bmax from &BinDS;

%do i=1 %to &Bmax;

%local m&i;

   create table Temp_BinC&i as select * from &BinDS where Bin=&i;

   select count(*) into:m&i from Temp_BinC&i;

%end;

   create table temp_allVals (BinToSplit num, DatasetName char(80), Value num);

run;quit;

%do i=1 %to &Bmax;

 %if (&&m&i>1) %then %do;  

  %BestSplit(Temp_BinC&i, &i);

  data temp_trysplit&i;

    set temp_binC&i;

      if split=1 then Bin=%eval(&Bmax+1);

  run;

  Data temp_main&i;

   set &BinDS;

   if Bin=&i then delete;

  run;

  Data Temp_main&i;

    set temp_main&i temp_trysplit&i;

  run;

  %let value=;

 %GValue(temp_main&i,  Value);

 proc sql noprint;

  insert into temp_AllVals values(&i, "temp_main&i", &Value);

 run;quit;

 %end;

%end;

proc sort data=temp_allVals;

by descending value;

run;

data _null_;

 set temp_AllVals(obs=1);

 call symput("bin", compress(BinToSplit));

run;

Data &NewBins;

 set Temp_main&Bin;

 drop split;

run;

/* Clean the workspace */

/*proc datasets nodetails nolist library=work;*/

/* delete temp_AllVals %do i=1 %to &Bmax; Temp_BinC&i  temp_TrySplit&i temp_Main&i %end; ; */

/*run;*/

/*quit;*/

%mend;


%macro BinContVar(DSin, IVVar, DVVar, MMax, Acc, DSVarMap);

%local VarMax VarMin;

proc sql noprint;

 select min(&IVVar), max(&IVVar) into :VarMin, :VarMax from &DSin;

quit;

%local Mbins i MinBinSize;

%let Mbins=%sysfunc(int(%sysevalf(1.0/&Acc)));

%let MinBinSize=%sysevalf((&VarMax-&VarMin)/&Mbins);

%do i=1 %to %eval(&Mbins);

 %local Lower_&i Upper_&i;

 %let Upper_&i = %sysevalf(&VarMin + &i * &MinBinSize);

 %let Lower_&i = %sysevalf(&VarMin + (&i-1)*&MinBinSize);

%end;

%let Lower_1 = %sysevalf(&VarMin-0.0001);

%let Upper_&Mbins=%sysevalf(&VarMax+0.0001);

data Temp_DS;

 set &DSin;

 %do i=1 %to %eval(&Mbins-1);

  if &IVVar>=&&Lower_&i and &IVVar < &&Upper_&i Then Bin=&i;

 %end;

  if &IVVar>=&&Lower_&Mbins and &IVVar <= &&Upper_&MBins Then Bin=&MBins;

 keep &IVVar &DVVar Bin;

run;

data temp_blimits;

 %do i=1 %to %Eval(&Mbins-1);

   Bin_LowerLimit=&&Lower_&i;

   Bin_UpperLimit=&&Upper_&i;

   Bin=&i;

   output;

 %end;

   Bin_LowerLimit=&&Lower_&Mbins;

   Bin_UpperLimit=&&Upper_&Mbins;

   Bin=&Mbins;

   output;

run;

proc sort data=temp_blimits;

by Bin;

run;

proc freq data=Temp_DS noprint;

 table Bin*&DVvar /out=Temp_cross;

 table Bin /out=Temp_binTot;

 run;

proc sort data=temp_cross;

 by Bin;

run;

proc sort data= temp_BinTot;

by Bin;

run;

data temp_cont;

merge Temp_cross(rename=count=Ni2 ) temp_BinTot(rename=Count=total) temp_BLimits ;

by Bin;

Ni1=total-Ni2;

PDV1=bin;

label  Ni2= total=;

if Ni1=0 then output;

else if &DVVar=1 then output;

drop percent &DVVar;

run;

data temp_contold;

set temp_cont;

run;


proc sql noprint;

%local mx;

 %do i=1 %to &Mbins;

  select count(*) into : mx from Temp_cont where Bin=&i;

  %if (&mx>0) %then %do;

 select Ni1, Ni2, total, bin_lowerlimit, bin_upperlimit into  :Ni1,:Ni2,:total, :bin_lower, :bin_upper

  from temp_cont where Bin=&i;

  %if (&i=&Mbins) %then %do;

  select max(bin) into :i1 from temp_cont where Bin<&Mbins;

    %end;

  %else %do;

   select min(bin) into :i1 from temp_cont where Bin>&i;

   %end;

   %if (&Ni1=0) or (&Ni2=0) or (&total=0) %then %do;

   update temp_cont set Ni1=Ni1+&Ni1 ,

    Ni2=Ni2+&Ni2 ,

     total=total+&Total

   where bin=&i1;

   %if (&i<&Mbins) %then %do;

  update temp_cont set Bin_lowerlimit = &Bin_lower

     where bin=&i1;

   %end;

   %else %do;

     update temp_cont set Bin_upperlimit = &Bin_upper

             where bin=&i1;

   %end;

    delete from temp_cont where bin=&i;

   %end;

  %end;

%end;

quit;

proc sort data=temp_cont;

by pdv1;

run;

%local m;

data temp_cont;

 set temp_cont;

 i=_N_;

 Var=bin;

 Bin=1;

 call symput("m", compress(_N_));

run;


%local Nbins ;

%let Nbins=1;  

%DO %WHILE (&Nbins <&MMax);

      %CandSplits(temp_cont, Temp_Splits);

      Data Temp_Cont;

        set Temp_Splits;

      run;

   %let NBins=%eval(&NBins+1);

%end;


data temp_Map1 ;

 set temp_cont(Rename=Var=OldBin);

 drop Ni2 PDV1 Ni1 i ;

 run;

proc sort data=temp_Map1;

by Bin OldBin ;

run;


data temp_Map2;

 retain  LL 0 UL 0 BinTotal 0;

 set temp_Map1;

by Bin OldBin;

Bintotal=BinTotal+Total;

if first.bin then do;

  LL=Bin_LowerLimit;

  BinTotal=Total;

    End;

if last.bin then do;

 UL=Bin_UpperLimit;

 output;

end;

drop Bin_lowerLimit Bin_upperLimit Bin OldBin total;

 run;

proc sort data=temp_map2;

by LL;

run;

data &DSVarMap;

set temp_map2;

Bin=_N_;

run;

/* Clean the workspace */

/*proc datasets nodetails library=work nolist;*/

/* delete temp_bintot temp_blimits temp_cont temp_contold temp_cross temp_ds temp_map1*/

/*    temp_map2 temp_splits;*/

/*run; quit;*/

%mend;

%macro ApplyMap2(DSin, VarX, NewVarX, DSVarMap, DSout);


%local m i;

proc sql noprint;

 select count(Bin) into:m from &DSVarMap;

quit;

%do i=1 %to &m;

 %local Upper_&i Lower_&i Bin_&i;

%end;

data _null_;

 set &DSVarMap;

  call symput ("Upper_"||left(_N_), UL);

  call symput ("Lower_"||left(_N_), LL);

  call symput ("Bin_"||left(_N_), Bin);

run;

Data &DSout;

 set &DSin;

 IF &VarX < &Upper_1 Then &NewVarX=&Bin_1;

 %do i=2 %to %eval(&m-1);

   if &VarX >= &&Lower_&i and &VarX < &&Upper_&i Then &NewVarX=&&Bin_&i;

 %end;

   if &VarX >= &&Lower_&i  Then &NewVarX=&&Bin_&i;

   DROP &VarX.;

Run;

%mend;

 %macro var_namelist(data=,coltype=,tarvar=,dsor=);

    %let lib=%upcase(%scan(&data.,1,'.'));

    %let dname=%upcase(%scan(&data.,2,'.'));

    %global var_list var_num;

    proc sql ;

      create table &dsor. as

        select name

        from sashelp.VCOLUMN

        where left(libname)="&lib." and left(memname)="&dname." and type="&coltype." and lowcase(name)^=lowcase("&tarvar.") and lowcase(name)^="appl_id";

    quit;

%mend;


%macro pub_best(data=,tarvar=,MMax=,ACC=,DSout=);

proc datasets lib=work;

delete _all_;

run;

%var_namelist(data=&data.,coltype=num,tarvar=&tarvar.,dsor=aa);

  data _null_;

   set aa;

   call symput (compress("var"||left(_n_)),compress(name));

   call symput(compress("n"),compress(_n_));

   run;

%do i=1 %to &n.;

    %put &&Var&i.;

    %BinContVar(DSin=&data., IVVar=&&Var&i., DVVar=&tarvar.,MMax=&MMax., ACC=&Acc., DSVarMap=AA_1);

    %ApplyMap2(DSin=&data., VarX=&&Var&i., NewVarX=N_&&Var&i., DSVarMap=AA_1, DSout=&DSout.);


%END;

%MEND;
[分割线]
这代码有点长,你就直接复制到sas里面看吧。
data=填入原始的数据集
tarvar=因变量;
MMax=分几组;
Acc=刚才是分几组,譬如你是1-100,那么你设定的是0.01,那就是分成100组,建议acc设定在0.01-0.05之间;
DSout=输出数据集。
代码是我调试好的,可以直接用。

数据分析咨询请扫描二维码

最新资讯
更多
客服在线
立即咨询