/* ****************************************************************
 * Name: tmsvdcut
 *
 * Usage: %tmsvdcut(out=,sigma=,key=,alpha=);
 *   EM4.3 code node: %tmsvdcut(out=&_mac_2,sigma=emdata.sig_****,key=&_mac_3);
 *
 * Purpose: To assist users in determining how many SVD dimensions to use for
 *   their data.
 *
 * Parms: OUT= the out data set from SAS Text Miner. If running in
 *   EM4.3, you can use a SAS code node immediately after the
 *   Text Miner tool, and set it to &_mac_2, the macro var for the out
 *   data set. If using EM5.1, run the Text Miner tool, look at
 *   the results, and find the data set that contains _OUT_ in
 *   its name.
 *
 *   SIGMA= the sigma data set from the tool. Look at the log
 *   for the tool, and choose the data set that contains
 *   "_sigma" in EM5.1 or "sig_" in EM4.3.
 *
 *   KEY= the key data set used in the tool. If using a code
 *   node in EM4.3, this is the &_mac_3 data set. In EM5.1, look
 *   at the log and use the data set containing "_TERMS".
 *
 *   ALPHA= the significance level to require for a cutoff to be
 *   noted, default = .01. Lower values will cause fewer
 *   potential cutoffs to be noted.
 *
 * Output: Running the macro will print messages in your log to help
 *   you in determining the number of dimensions to use. Potential
 *   cutoffs will be noted in the log, by printing out the following
 *   line for each one:
 *     cutoff at <n> ; % signal: <pct> ; dif= <dif> ; p= <p>
 *   where
 *     <n>   : the number of dimensions to cut off at
 *     <pct> : what percent of the total variance is accounted for
 *             by this number of dimensions.
 *     <dif> : the difference between this singular value and the
 *             one after it.
 *     <p>   : the probability that this singular value difference
 *             could be due to noise alone. This will always be less
 *             than the ALPHA parameter specified.
 *
 * Suggested usage: Run the Text Miner tool with a lot of SVD
 *   dimensions (e.g. 500). Then run the %tmsvdcut macro. Use the
 *   largest value cutoff that seems reasonable.
*
* History:
*   05Oct04 Initial Coding [cox]
*
* Notes:
* End
* ************************************************************** */

/* ----------------------------------------------------------------
   tmsvdcut: scan the singular values in the SIGMA data set for
   unusually large gaps and report each one in the log as a
   candidate SVD cutoff.

   Parameters (all keyword):
     out=    term-by-document data set; the code reads the columns
             _termnum_, _document_ and _count_. It is merged BY
             _termnum_, so it must already be sorted by _termnum_.
     sigma=  data set with one row per singular value in the column
             svalues, largest first (assumed from the running-total
             logic below; TODO confirm against the tool's output).
     key=    terms data set; the code reads the columns key, parent,
             keep and weight.
     alpha=  significance threshold for reporting a gap (default .01).

   Side effects: creates WORK data sets _tmpchild, _modout and _sigma,
   prints the PROC SQL result to the output destination, and writes
   one "cutoff at ..." line to the log per candidate cutoff.
   ---------------------------------------------------------------- */
%macro tmsvdcut(out=,sigma=,key=,alpha=.01);
  /* Keep the SQL INTO target from overwriting a caller's global variable. */
  %local totsqrs;

  /* Rows of the key data set that have a parent term, ordered by key. */
  proc sort data=&key (where=(parent ne .)) out=_tmpchild(keep=parent key);
    by key;
  run;

  /* Re-label every count that belongs to a child term with its parent's
     term number. Only rows present in the out data set are kept. */
  data _modout (keep=_termnum_ _document_ _count_);
    merge _tmpchild (rename=(key=_termnum_) in=a)
          &out (in=b);
    by _termnum_;
    if b;
    if a then _termnum_ = parent;
  run;

  /* Re-labelling can yield several rows for the same term/document pair;
     collapse them into one row carrying the summed count. */
  proc sort data=_modout;
    by _termnum_ _document_;
  run;

  data _modout (keep=_termnum_ _document_ _count_);
    retain totcount;
    set _modout;
    by _termnum_ _document_;
    if first._document_ then totcount = _count_;
    else totcount = totcount + _count_;
    if last._document_ then do;
      _count_ = totcount;
      output;
    end;
  run;

  /* Calculate the total sums of squares of the out matrix. This will be
     equal to the sums of the squares of the eigenvalues (sigma matrix)
     for the non-truncated SVD. This is for calculating percent of total
     variance accounted for by the first m singular values. */
  proc sql;
    select sum((log2(_count_+1)*weight)**2) into :totsqrs
      from _modout, &key
      where parent = . and keep='Y' and key = _termnum_;
  quit;

  /* Pass 1 (ascending position): gap to the previous singular value and
     cumulative share of the total sum of squares. */
  data _sigma;
    retain tot totsqr;
    /* Fix: this statement had been corrupted to "set <sigma character>"
       with its semicolon lost; it must read the SIGMA= data set. */
    set &sigma;
    format totpct cutpct percent7.1;
    dif1 = lag1(svalues) - svalues;
    if _n_ = 1 then do;
      tot = 0;
      totsqr = 0;
    end;
    /* if _n_ <= 400; */
    pos = _n_;
    tot = svalues + tot;
    totsqr = totsqr + svalues*svalues;
    totpct = totsqr/&totsqrs;
    /* Share explained by the first pos-1 values. This is the figure that
       belongs to a cutoff reported at pos-1 in pass 2. */
    cutpct = lag1(totpct);
  run;

  /* Pass 2 (descending position): accumulate the gaps from the smallest
     singular values upward and test each gap against the gaps seen so far. */
  proc sort data=_sigma;
    by descending pos;
  run;

  data _sigma;
    retain diftot diftotsqr firstpos;
    set _sigma;
    if _n_ = 1 then do;
      diftot = 0;
      diftotsqr = 0;
      firstpos = pos;
    end;
    diftot = diftot + dif1;
    diftotsqr = diftotsqr + dif1*dif1;
    /* Require more than 25 gaps before testing. The final row (pos = 1)
       is skipped because its dif1 is missing. */
    if _n_ > 25 and _n_ < firstpos then do;
      meandif = diftot/_n_;
      var = diftotsqr/_n_ - meandif*meandif;
      stddif = sqrt(var);
      /* Gamma shape and scale from the mean and variance of the gaps. */
      lambda = (meandif / stddif)**2;
      beta = var / meandif;
      prob = 1 - cdf('gamma',dif1, lambda, beta);
      /* t = (dif1 - meandif)/stddif; */
      /* Fix: a missing prob (zero or negative variance, zero mean) sorts
         below every number in SAS and would otherwise pass the test. */
      if not missing(prob) and prob < &alpha then do;
        cutoff = pos-1;
        /* Fix: report the variance share for the cutoff (pos-1), not pos. */
        put 'cutoff at ' cutoff '; % signal:' cutpct '; dif=' dif1 '; p=' prob;
      end;
      else cutoff = 0;
    end;
  run;
%mend tmsvdcut;

/*
%tmsvdcut(out=emdata.out_ps49,sigma=emdata.sig_ycco,key=emdata.key_3r9i,alpha=.01);
*/