天天看點

Bagging算法在SAS中的實作

原文位址:Bagging算法在SAS中的實作 作者:文穗
           
%macro bagging(data = , y = , numx = , catx = , ntrees = 10);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BAGGING       *;
* PROPOSED BY LEO BREIMAN (1996)                          *;
* ======================================================= *;
* PARAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BAGGING                  *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  1. A SAS CATALOG FILE NAMED "TREEFILES" IN THE WORKING *;
*     DIRECTORY CONTAINING ALL SCORING FILES IN BAGGING   *;
*  2. A LST FILE SHOWING ks STATISTICS OF THE BAGGING     *;
*     CLASSIFIER AND EACH TREE CLASSIFIER                 *;
* ======================================================= *;
* NOTES:                                                  *;
*  ROWS WHERE Y IS NOT 0 OR 1 ARE EXCLUDED FROM SAMPLING, *;
*  TREE BUILDING AND SCORING                              *;
* ======================================================= *;
* CONTACT:                                                *;
*  [email protected], LOSS FORECASTING & RISK MODELING    *;
***********************************************************;

*** keep working macro variables out of the calling scope ***;
%local i seed nobs max_ks min_ks bag_ks;

options mprint mlogic nocenter nodate nonumber;

*** a random seed value subject to change ***;
%let seed = 20110613;

*** assign a library to the working folder ***;
libname _path '';

*** generate a series of random seeds, one macro variable random1, random2, ... per tree ***;
data _null_;
  do i = 1 to &ntrees;
    random = put(ranuni(&seed) * (10 ** 8), 8.);
    name   = compress("random"||put(i, 3.), ' ');
    call symput(name, random);
  end;
run;

*** clean up catalog files in the library ***;
proc datasets library = _path nolist;
  delete TreeFiles tmp / memtype = catalog;
run;
quit;

*** size of each bootstrap sample = number of rows with a valid 0/1 response ***;
proc sql noprint;
  select count(*) into :nobs from &data where &y in (1, 0);
quit;

*** modeling table: same 0/1 filter as the row count above, plus a sequential row id ***;
data _tmp1 (keep = &y &numx &catx _id_);
  set &data;
  where &y in (1, 0);
  _id_ + 1;
run;

%do i = 1 %to &ntrees;
  %put &&random&i;

  *** generate bootstrap samples for bagging ***;
  *** method = urs samples with replacement and NumberHits is the draw count per row ***;
  proc surveyselect data = _tmp1 method = urs n = &nobs seed = &&random&i
    out = sample&i(rename = (NumberHits = _hits)) noprint;
  run;

  *** generate data mining datasets for sas e-miner ***;
  proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
    class &y &catx;
    var &numx;
    target &y;
    freq _hits;
  run;

  *** create a sas temporary catalog to contain sas output ***;
  filename out_tree catalog "_path.tmp.out_tree.source";

  *** create decision tree mimicking CART ***;
  proc split data = db_sample&i dmdbcat = cl_sample&i
    criterion    = gini
    assess       = impurity
    maxbranch    = 2
    splitsize    = 100
    subtree      = assessment
    exhaustive   = 0
    nsurrs       = 0;
    code file    = out_tree;
    input &numx   / level = interval;
    input &catx   / level = nominal;
    target &y     / level = binary;
    freq _hits;
  run;

  *** create a permanent sas catalog to contain all tree outputs ***;
  filename in_tree catalog "_path.TreeFiles.tree&i..source";

  *** copy the generated score code, skipping its first three lines ***;
  *** NOTE(review): presumably those lines are a generated header - confirm ***;
  data _null_;
    infile out_tree;
    input;
    file in_tree;
    if _n_ > 3 then put _infile_;
  run;

  *** score the original data by each tree output file ***;
  data _score&i (keep = p_&y.1 p_&y.0 &y _id_);
    set _tmp1;
    %include in_tree;
  run;

  *** calculate KS stat ***;
  *** the procedure listing is rerouted so only the ods table is kept ***;
  proc printto new print = lst_out;
  run;

  ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
  proc npar1way wilcoxon edf data = _score&i;
    class &y.;
    var p_&y.1;
  run;

  proc printto;
  run;

  %if &i = 1 %then %do;
    *** first tree: start the stacked score table and the KS table from scratch ***;
    data _tmp2;
      set _score&i;
    run;

    data _ks;
      set _kstmp (keep = nvalue2);
      tree_id = &i;
      seed    = &&random&i;
      ks      = round(nvalue2 * 100, 0.0001);
    run;
  %end;
  %else %do;
    *** later trees: append scores without re-reading the accumulated table ***;
    proc append base = _tmp2 data = _score&i;
    run;

    data _ks;
      set _ks _kstmp(in = a keep = nvalue2);
      if a then do;
        tree_id = &i;
        seed    = &&random&i;
        ks      = round(nvalue2 * 100, 0.0001);
      end;
    run;
  %end;

%end;

*** aggregate predictions from all trees in the bag ***;
*** one output row per _id_ holding the mean of each probability across trees ***;
proc summary data = _tmp2 nway;
  class _id_;
  output out = _tmp3(drop = _type_ rename = (_freq_ = freq))
  mean(p_&y.1) =  mean(p_&y.0) =  mean(&y) = ;
run;

*** calculate bagging KS stat ***;
proc printto new print = lst_out;
run;

ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _tmp3;
  class &y;
  var p_&y.1;
run;

proc printto;
run;

*** the bagged classifier is stored as tree_id 0 with the master seed ***;
data _ks;
  set _ks _kstmp (in = a keep = nvalue2);
  if a then do;
    tree_id = 0;
    seed    = &seed;
    ks      = round(nvalue2 * 100, 0.0001);
  end;
run;

proc sort data = _ks;
  by tree_id;
run;

proc sql noprint;
  select max(ks) into :max_ks from _ks where tree_id > 0;

  select min(ks) into :min_ks from _ks where tree_id > 0;

  select ks into :bag_ks from _ks where tree_id = 0;
quit;

*** summarize the performance of bagging classifier and each tree in the bag ***;
title "MAX KS = &max_ks, MIN KS = &min_ks, BAGGING KS = &bag_ks";
proc print data = _ks noobs;
  var tree_id seed ks;
run;
title;

*** drop the temporary catalog and keep TreeFiles as the deliverable ***;
proc datasets library = _path nolist;
  delete tmp / memtype = catalog;
run;
quit;

%mend bagging;

*** numeric predictors passed to the macro as NUMX ***;
%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt
          tot_rev_line rev_util bureau_score ltv tot_income;

*** categorical predictors passed to the macro as CATX ***;
%let x2 = purpose;

*** folder holding the input table, adjust to the local installation ***;
libname data 'D:\SAS_CODE\bagging';

*** run bagging with 10 trees on the accepts table using bad as the 0/1 response ***;
%bagging(data = data.accepts, y = bad, numx = &x1, catx = &x2, ntrees = 10);