SAS: Entropy-Based Discretization of a Continuous Variable
2017-03-31

This post walks through the overall program logic and the SAS code in detail.

The macro %BinContVar calls the macro %CandSplits; %CandSplits in turn calls the macros %BestSplit and %GValue; finally, the macro %ApplyMap applies the resulting mapping to a dataset.

The table below lists the parameters of %BinContVar:

%BinContVar(DSin, IVVar, DVVar, MMax, Acc, DSVarMap)

Parameter   Description
DSin        Input dataset
IVVar       Continuous independent variable
DVVar       Binary dependent variable
MMax        Desired number of final bins (groups)
Acc         Minimum segment size as a fraction of the variable's range; the initial pass creates int(1/Acc) equal-width segments
DSVarMap    Output dataset containing the mapping rules
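For example, a call might look like the one below. The dataset and variable names (CustomerData, income, default, IncomeMap) and the parameter values are illustrative assumptions, not taken from the article.

/* Hypothetical call: bin income against the binary target default,
   producing at most 5 final bins, with initial equal-width segments
   of 5% of the range (Acc=0.05). */
%BinContVar(CustomerData, income, default, 5, 0.05, IncomeMap);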

First, the variable's range is divided into int(1/Acc) equal-width segments (10 in the original example); these segments are then treated as the levels of a nominal variable, and optimal binary splits based on the entropy variance are used to find the best grouping.

The main job of %BinContVar is the optimal binning of the continuous variable. It nests %CandSplits, whose job is to generate candidate splits of the current segments and select the best one. %CandSplits in turn nests two macros: %BestSplit, which finds the best split of a group, and %GValue, which computes the entropy variance.
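As a point of reference, the entropy variance can be read as an entropy ratio of the form 1 - E_within / E_parent: one minus the weighted average entropy of the candidate child groups divided by the entropy of the unsplit parent, so values closer to 1 indicate a better split. The DATA step below is a minimal sketch of that calculation for one candidate split of a binary target; it illustrates the measure under that assumed definition and is not the article's %GValue code, and the counts are invented.

/* Minimal sketch (not the article's %GValue macro): entropy ratio for one
   candidate binary split. The counts n11..n22 are invented for illustration. */
data entropy_ratio;
   n11=30; n12=70;                       /* left group : DV=1, DV=0 counts  */
   n21=60; n22=40;                       /* right group: DV=1, DV=0 counts  */
   n1=n11+n12; n2=n21+n22; n=n1+n2;
   /* entropy within each child group, converted to base 2 */
   e1=-((n11/n1)*log(n11/n1)+(n12/n1)*log(n12/n1))/log(2);
   e2=-((n21/n2)*log(n21/n2)+(n22/n2)*log(n22/n2))/log(2);
   e_within=(n1/n)*e1+(n2/n)*e2;
   /* entropy of the parent (unsplit) node */
   p1=(n11+n21)/n; p0=(n12+n22)/n;
   e_parent=-(p1*log(p1)+p0*log(p0))/log(2);
   /* entropy ratio: larger means the split separates the classes better */
   e_ratio=1-e_within/e_parent;
   put e_ratio=;
run;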

/* Optimal binning of a continuous variable */
/*
1. Find the minimum and maximum of the continuous variable;
2. Split its range into equal-width segments and treat the segment numbers as a nominal variable;
3. Optimally group the segments from step 2 until the requested number of groups is reached.
*/

%macro BinContVar(DSin, IVVar, DVVar, MMax, Acc, DSVarMap);


%local VarMax VarMin;

proc sql noprint;

select min(&IVVar), max(&IVVar) into :VarMin, :VarMax from &DSin;

quit;

%local Mbins i MinBinSize;

%let Mbins=%sysfunc(int(%sysevalf(1.0/&Acc))); /* number of equal-width segments */

%let MinBinSize=%sysevalf((&VarMax-&VarMin)/&Mbins); /* width of each segment */

/* compute the lower and upper limit of each segment */

%do i=1 %to %eval(&Mbins);

%local Lower_&i Upper_&i;

%let Upper_&i = %sysevalf(&VarMin + &i * &MinBinSize);

%let Lower_&i = %sysevalf(&VarMin + (&i-1)*&MinBinSize);

%end;

%let Lower_1 = %sysevalf(&VarMin-0.0001);

%let Upper_&Mbins=%sysevalf(&VarMax+0.0001);

/* assign each observation of the continuous variable (income in the original example) to its equal-width segment */

data Temp_DS;

set &DSin;

%do i=1 %to %eval(&Mbins-1);

if &IVVar>=&&Lower_&i and &IVVar < &&Upper_&i Then Bin=&i;

%end;

if &IVVar>=&&Lower_&Mbins and &IVVar <= &&Upper_&MBins Then Bin=&MBins;

run;

/* store the lower and upper limits of each equal-width segment */

data temp_blimits;

%do i=1 %to %Eval(&Mbins-1);

Bin_LowerLimit=&&Lower_&i;

Bin_UpperLimit=&&Upper_&i;

Bin=&i;

output;

%end;

Bin_LowerLimit=&&Lower_&Mbins;

Bin_UpperLimit=&&Upper_&Mbins;

Bin=&Mbins;

output;

run;

proc sort data=temp_blimits;

by Bin;

run;

/* frequency of each level of the binary dependent variable within each segment */

proc freq data=Temp_DS noprint;

table Bin*&DVvar /out=Temp_cross;

table Bin /out=Temp_binTot;

run;

proc sort data=temp_cross;

by Bin;

run;

proc sort data= temp_BinTot;

by Bin;

run;


data temp_cont;

merge Temp_cross(rename=count=Ni2) temp_BinTot(rename=Count=total) temp_BLimits; /* Ni2: frequency of the DV level within the segment; total: total frequency of the segment */

by Bin;

Ni1=total-Ni2;

PDV1=bin;

label Ni2= total=;

if Ni1=0 then output;

else if &DVVar=1 then output;

drop percent &DVVar;

run;


data temp_contold;

set temp_cont;

run;

/* merge any segment whose Ni1, Ni2, or total equals 0 into an adjacent segment */

proc sql noprint;

%local mx;

%do i=1 %to &Mbins;

select count(*) into : mx from Temp_cont where Bin=&i;

%if (&mx>0) %then %do;

select Ni1, Ni2, total, bin_lowerlimit, bin_upperlimit into

:Ni1,:Ni2,:total, :bin_lower, :bin_upper

from temp_cont where Bin=&i;

%if (&i=&Mbins) %then %do;

select max(bin) into :i1 from temp_cont where Bin<&Mbins;

%end;

%else %do;

select min(bin) into :i1 from temp_cont where Bin>&i;

%end;

%if (&Ni1=0) or (&Ni2=0) or (&total=0) %then %do;

update temp_cont set

Ni1=Ni1+&Ni1 ,

Ni2=Ni2+&Ni2 ,

total=total+&Total

where bin=&i1;

%if (&i<&Mbins) %then %do;

update temp_cont set Bin_lowerlimit = &Bin_lower where bin=&i1;

%end;

%else %do;

update temp_cont set Bin_upperlimit = &Bin_upper where bin=&i1;

%end;

delete from temp_cont where bin=&i;

%end;

%end;

%end;

quit;


proc sort data=temp_cont;

by pdv1;

run;

%local m;

/* re-index the segments and store the number of categories in macro variable m */

data temp_cont;

set temp_cont;

i=_N_;

Var=bin;

Bin=1;

call symput("m", compress(_N_));

run;


%local Nbins ;

%let Nbins=1;

%DO %WHILE (&Nbins <&MMax);

/* choose the best split from all candidate splits, based on the entropy measure */

%CandSplits(temp_cont, Temp_Splits);

Data Temp_Cont;

set Temp_Splits;

run;

%let NBins=%eval(&NBins+1);

%end;
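/* Build the output mapping: temp_Map1 keeps one record per original segment
   together with its final group number; temp_Map2 then collapses each group
   into a single interval [LL, UL] with its total frequency. */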


data temp_Map1 ;

set temp_cont(Rename=Var=OldBin);

drop Ni2 PDV1 Ni1 i ;

run;

proc sort data=temp_Map1;

by Bin OldBin ;

run;


data temp_Map2;

retain LL 0 UL 0 BinTotal 0;

set temp_Map1;

by Bin OldBin;

Bintotal=BinTotal+Total;

if first.bin then do;

LL=Bin_LowerLimit;

BinTotal=Total;

End;

if last.bin then do;

UL=Bin_UpperLimit;

output;

end;

drop Bin_lowerLimit Bin_upperLimit Bin OldBin total;

run;

proc sort data=temp_map2;

by LL;

run;

data &DSVarMap;

set temp_map2;

Bin=_N_;

run;

%mend;
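The article applies the resulting mapping with %ApplyMap, whose code is not shown in this section. As a rough illustration of what that application amounts to, the step below scores an input dataset against the [LL, UL) intervals stored in the mapping dataset. It is a sketch only, not the article's %ApplyMap macro, and the names CustomerData, income, IncomeMap, and Scored are hypothetical.

/* Sketch only (not the article's %ApplyMap macro): attach to each record the
   bin whose interval [LL, UL) from the mapping dataset contains its value. */
proc sql;
   create table Scored as
   select a.*, b.Bin as income_bin
   from CustomerData as a
   left join IncomeMap as b
   on a.income >= b.LL and a.income < b.UL;
quit;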

