This post was kindly contributed by SAS Programming for Data Mining Applications - go there to comment and to read the full post. |
In the 2010 SASware Ballot®, a dedicated PROC for Randomized SVD was among the options. While an official SAS PROC will not be available in the immediate future as well as in older SAS releases, it is fairly simple to implement this algorithm using existing SAS/STAT procedures.
Randomized SVD will be useful for large scale, high dimension data mining problems, for instance Text Mining. In SAS/Base and SAS/STAT, lack of sparse matrix operation capability puts any serious Text Mining task at the edge of infeasibility, such as using LSI or NMF algorithms. Randomized SVD provides an economic alternate solution by sacrificing a little accuracy which is bounded under the three sampling schema proposed by the authors [1], while the code below demos sampling schema 1.
/* Randomized SVD with sampling schema 1. */
%let dim=2048;
%let nobs=1e4;
%let s=256;
data matrix;
array _x{*} x1-x&dim;
do id=1 to &nobs;
do _j=1 to dim(_x); _x[_j]=sin(mod(id, _j))+rannor(id); end;
output;
drop _j;
end;
run;
%let datetime_start = %sysfunc(TIME()) ;
%let time=%sysfunc(datetime(), datetime.); %put &time;
data seed;
array _x{*} x1-x&dim;
do _j=1 to dim(_x); _x[_j]=0; end;
output;
stop;
run;
proc fastclus data=matrix seed=seed out=norm(keep=ID DISTANCE)
maxiter=0 maxclusters=1 noprint replace=none;
var x1-x&dim;
run;
data normv/ view=normv;
set norm(keep=DISTANCE);
DISTANCE2=DISTANCE**2;
drop DISTANCE;
run;
proc means data=normv noprint;
var DISTANCE2;
output out=matrixnorm sum(DISTANCE2)=Frobenius_sqr;
run;
data prob;
set matrixnorm ;
retain Frobenius_sqr;
do until (eof);
set norm end=eof;
_rate_=DISTANCE**2/Frobenius_sqr;
keep ID _rate_;
output;
end;
run;
data matrixv/view=matrixv;
merge matrix prob(keep=_rate_);
run;
ods select none;
proc surveyselect data=matrixv out=matrixsamp(drop=SamplingWeight ExpectedHits NumberHits _rate_)
sampsize=&s method=pps_wr outhits ;
size _rate_;
run;
ods select all;
proc transpose data=matrixsamp out=matrixsamp;
var x1-x&dim;
run;
proc princomp data=matrixsamp outstat=testv(where=(_type_ in ("USCORE")))
noint cov noprint;
var col1-col&s;
run;
data testV_t/view=testV_t;
retain _TYPE_ 'PARMS';
set testv(drop=_TYPE_);
run;
proc score data=matrixsamp score=testV_t type=parms
out=SW(keep=ID Prin:);
var col1-col&s;
run;
data seed;
array _s{*} prin1-prin&s;
do _j=1 to dim(_s); _s[_j]=0; end;
drop _j; output; stop;
run;
proc fastclus data=SW seed=seed maxiter=0 maxc=1 replace=NONE out=SW2(drop=CLUSTER) noprint;
var prin1-prin&s;
run;
data HHT;
set SW2;
array _x{*} prin1-prin&s;
do _j=1 to dim(_x); _x[_j]=(_x[_j]/distance)**2; end;
drop _j distance;
run;
proc transpose data=HHT out=HHT2(drop=_LABEL_);
run;
data HHT2;
_TYPE_='PARMS';
set HHT2;
rename COL1-COL2048=x1-X2048;
run;
proc score data=matrix score=HHT2 type=parms out=P(drop=x:);
var x:;
run;
%let time=%sysfunc(datetime(), datetime.); %put &time;
%put PROCESSING TIME: %sysfunc(putn(%sysevalf(%sysfunc(TIME())-&datetime_start.),mmss.)) (mm:ss) ;
options notes source;
Reference:
[1], P. Drineas and M. W. Mahoney, “Randomized Algorithms for Matrices and Massive Data Sets”, Proc. of the 32nd Annual Conference on Very Large Data Bases (VLDB), p. 1269, 2006.
This post was kindly contributed by SAS Programming for Data Mining Applications - go there to comment and to read the full post. |