This post was kindly contributed by SAS Programming for Data Mining Applications - go there to comment and to read the full post. |
SVD is at the heart of many modern machine learning algorithms. As a computing vehicle for PCA, SVD can be obtained by running PROC PRINCOMP on the covariance matrix of a given data matrix without correction for the intercept. With SVD, we are ready to carry out many tasks that are very useful but not readily available in SAS/STAT, such as text mining using LSI (the default algorithm used in SAS Text Miner [1]), multivariate time series analysis using MSSA, Logistic-PLS, etc.
I also highly recommend the book “Principal Component Analysis, 2nd Edition” by I. T. Jolliffe [2]. Prof. Jolliffe gives a thorough and readable review of PCA and its applications in various fields, and provides a road map for further research and reading.
/*---------------------------------------------------------------------------
  %SVD - singular value decomposition X = U * S * V` via PROC PRINCOMP.

  PROC PRINCOMP is run with NOINT COV, i.e. on the covariance matrix that is
  NOT corrected for the mean, so the eigenvectors are the right singular
  vectors V and the eigenvalues times the number of observations are the
  squared singular values.

  Parameters (positional, unchanged from the original interface):
    input_dsn   input data set holding the matrix X (one row per observation)
    output_V    output data set for V (one row per input variable);
                leave blank to skip
    output_S    output data set for eigenvalues, proportions and singular
                values; leave blank to skip
    output_U    output data set for U (standardized scores / sqrt(n));
                leave blank to skip
    input_vars  blank-separated list of the numeric columns of X
    ID_var      optional ID variable(s) carried into output_U
    nfac=       number of factors to keep; 0 (default) keeps all of them

  Side effects: creates the intermediate data set &input_dsn._stat and
  temporarily switches off NOTES and SOURCE (restored on exit).
---------------------------------------------------------------------------*/
%macro SVD(
    input_dsn,
    output_V,
    output_S,
    output_U,
    input_vars,
    ID_var,
    nfac=0
);
    /* Every macro variable used below is local, so nothing leaks into or
       clobbers the caller scope. */
    %local EV USCORE n dsid rc nobs nstmt outstmt princomp_opts
           shownote showsource;

    /* Remember the current log options so they can be restored at %exit. */
    %let shownote=%sysfunc(getoption(NOTES));
    %let showsource=%sysfunc(getoption(SOURCE));
    options nonotes nosource;

    /* _TYPE_ values of the OUTSTAT= rows this macro consumes. */
    %let EV=EIGENVAL;
    %let USCORE=USCORE;

    %let n=%sysfunc(countW(&input_vars));

    /* Number of observations in the input; fail early if it cannot be opened. */
    %let dsid=%sysfunc(open(&input_dsn));
    %if &dsid eq 0 %then %do;
        %put ERROR: Cannot open input data set &input_dsn..;
        %goto exit;
    %end;
    /* NOTE(review): NOBS is the physical row count; it is used as n in the
       singular value formula, which presumes no deleted rows and no rows
       dropped by PROC PRINCOMP for missing values - confirm for your data. */
    %let nobs=%sysfunc(attrn(&dsid, NOBS));
    %let rc=%sysfunc(close(&dsid));

    /* Validate nfac before it is used in any numeric comparison. */
    %if %length(&nfac) eq 0 %then %let nfac=0;
    %if %sysfunc(notdigit(&nfac, 1)) ne 0 %then %do;
        %put ERROR: Only accept non-negative integer.;
        %goto exit;
    %end;

    %if &nfac eq 0 %then %do;
        /* 0 means keep everything: no N= option, but the later steps still
           need a real dimension, so use the number of input variables. */
        %let nstmt=;
        %let nfac=&n;
    %end;
    %else %do;
        %let nfac=%sysfunc(min(&nfac, &n));
        %let nstmt=%str(n=&nfac);
    %end;

    /* Scores are only requested when U is wanted. STD standardizes them to
       unit variance, i.e. divides X*V by the square root of the eigenvalue. */
    %if %length(&output_U) %then
        %let outstmt=out=&output_U.(keep=&ID_var Prin:) std;
    %else
        %let outstmt=;

    %let princomp_opts=noint cov noprint &nstmt;

    proc princomp data=&input_dsn
                  &outstmt
                  outstat=&input_dsn._stat(where=(_type_ in ("&USCORE", "&EV")))
                  &princomp_opts;
        var &input_vars;
    run;

    /* V: transpose the USCORE rows so that each input variable is a row and
       each principal component is a column. */
    %if %length(&output_V) %then %do;
        proc transpose data=&input_dsn._stat(where=(_TYPE_="&USCORE"))
                       out=&output_V.(rename=(_NAME_=variable))
                       name=_NAME_;
            var &input_vars;
            id _NAME_;
            format &input_vars 8.6;
        run;
    %end;

    /* S: eigenvalues, their share of the total, and the singular values. */
    %if %length(&output_S) %then %do;
        data &output_S;
            set &input_dsn._stat;
            where _TYPE_="&EV";
            /* In the EIGENVAL row the k-th input variable column holds the
               k-th eigenvalue. */
            array _s{*} &input_vars;
            /* columns: 1=eigenvalue, 2=proportion, 3=cumulative proportion */
            array _x{&nfac, 3} _temporary_;
            Total=sum(of &input_vars, 0);
            _t=0;
            do _i=1 to &nfac;
                _x[_i, 1]=_s[_i];
                _x[_i, 2]=_s[_i]/Total;
                if _i=1 then _x[_i, 3]=_x[_i, 2];
                else _x[_i, 3]=_x[_i-1, 3]+_x[_i, 2];
                _t+sqrt(_x[_i, 2]);
            end;
            do _i=1 to &nfac;
                Number=_i;
                EigenValue=_x[_i, 1];
                Proportion=_x[_i, 2];
                Cumulative=_x[_i, 3];
                /* S: square-root proportions normalized to sum to one. */
                S=sqrt(_x[_i, 2])/_t;
                /* eigenvalue of X`X/n times n = squared singular value of X */
                SinguVal=sqrt(_x[_i, 1] * &nobs);
                keep Number EigenValue Proportion Cumulative S SinguVal;
                output;
            end;
        run;
    %end;

    /* U = X*V/S: the standardized scores equal X*V/sqrt(eigenvalue), so a
       further division by sqrt(n) gives X*V/singular value. */
    %if %length(&output_U) %then %do;
        data &output_U;
            set &output_U nobs=ntotal;
            array _denum{1} _temporary_;
            array _A{*} Prin1-Prin&nfac;
            if _n_=1 then _denum[1]=sqrt(ntotal);
            do _j=1 to dim(_A);
                _A[_j]=_A[_j]/_denum[1];
            end;
            keep &ID_var Prin1-Prin&nfac;
        run;
    %end;

%exit:
    /* Restore the log options that were in effect on entry. */
    options &shownote &showsource;
%mend SVD;
Reference:
[1] Albright, Russ, “Taming Text with the SVD”, SAS Institute Inc., Cary, NC, available at :
http://ftp.sas.com/techsup/download/EMiner/TamingTextwiththeSVD.pdf
[2] Jolliffe, I. T. , “Principal Component Analysis”, 2nd Ed., Springer Series in Statistics, 2002
This post was kindly contributed by SAS Programming for Data Mining Applications - go there to comment and to read the full post. |