%macro TMSTART( DSNKEY          /* name of key dataset produced by Text Miner                               */
              , DSNOUT          /* name of startlist resulting from processing &DSNKEY                      */
              , ATTRIBUTE=      /* [optional] set of attributes in which acceptable terms have membership   */
              , KEEP=Y          /* [optional] flag to indicate keeping terms which are marked KEEP=Y|N      */
              , MAXDOCS=        /* [optional] max # of documents in which term may appear                   */
              , MINDOCS=2       /* [optional] min # of documents in which term may appear                   */
              , MAXFREQ=        /* [optional] max # of times term may appear in document corpus             */
              , MINFREQ=2       /* [optional] min # of times term may appear in document corpus             */
              , MAXWEIGHT=      /* [optional] upper bound on term weight for inclusion in startlist         */
              , MINWEIGHT=.0001 /* [optional] lower bound on term weight for inclusion in startlist         */
              , MINLEN=2        /* [optional] min term length (min # characters in a term)                  */
              , NOCONTR=Y       /* [optional] flag to remove contractions (terms that start w/ apostrophe)  */
              , NOSPECL=Y       /* [optional] flag to remove special chars (terms that start w/ & ! # @ )   */
              , ROLE=           /* [optional] set of roles in which acceptable terms must have membership   */
              ) ;

/* Copyright (C) 2004 by SAS Institute Inc., Cary, NC 27513
 *
 * Name:     tmstart.sas
 * Support:  robett Ross Bettinger
 * Product:  SAS Text Miner
 * Language: Sas Macro
 */

/* PURPOSE: create start list of terms for SAS Text Miner(TM)
 *
 * The macro copies &DSNKEY to &DSNOUT, applying one subsetting IF per active
 * filter. A filter whose parameter is blank is not generated at all.
 *
 * NOTE: &DSNKEY is the key dataset produced by Text Miner. it contains the following variables:
 *          Term Freq Numdocs Keep Weight Role Attribute
 *       the output dataset additionally lists KEY in its KEEP= option.
 *
 * NOTE: if &KEEP = Y then only terms which are marked Y for keeping are kept.
 *       otherwise (any other value, or blank) all terms are kept, including terms marked N
 *
 * NOTE: &MINLEN = 2 ensures that single-character terms are excluded from processing
 *       it is assumed that single-character terms have little or no information value.
 *       a blank &MINLEN disables the length filter
 *
 * NOTE: if &NOCONTR = Y then contractions (defined to be terms that start with an apostrophe)
 *       are deleted from the startlist
 *
 * NOTE: if &NOSPECL = Y then special chars (defined to be terms that start w/ & ! # @ )
 *       are deleted from the startlist
 *
 * NOTE: &ATTRIBUTE and &ROLE are blank-separated lists. they are upcased and compared
 *       against upcase( attribute ) and upcase( role ), so matching is case-insensitive
 *
 * EXAMPLE OF USE:
 *
 * %let DSNKEY    = key_a1b2 ;
 * %let DSNOUT    = startlist ;
 *
 * %let ATTRIBUTE = alpha entity ;    *** omit unknown attributes ***
 * %let MINFREQ   = 3 ;
 * %let MINDOCS   = 3 ;
 * %let ROLE      = company noun noun_group product verb ;
 *
 * libname EMDATA  'C:\# Home\My SAS Files\9.1\EM Projects\Edmunds.com 4.3\emdata' ;
 * libname SASDATA 'C:\# Home\My SAS Files\9.1\Edmunds.com\SASData' ;
 *
 * %TMSTART( &DSNKEY
 *         , SASDATA.&DSNOUT
 *         , ATTRIBUTE= &ATTRIBUTE
 *         , MINFREQ=   &MINFREQ
 *         , MINDOCS=   &MINDOCS
 *         , ROLE=      &ROLE
 *         )
 *
 * NOTE: qualify &DSNKEY with its libref when the key dataset is not in WORK
 *
 * NOTE: dataset SASDATA.&DSNOUT contains startlist terms suitable for Text Miner to use
 *       since it contains the Term and Role variables after filtering has been performed
 */

/* keep all working macro variables out of the calling scope */
%local I J TOKEN INLIST LISTVAR LISTVAL ;

/* both dataset names are required: without them the DATA step cannot be generated */
%if %length( &DSNKEY ) = 0 or %length( &DSNOUT ) = 0 %then %do ;
   %put ERROR: TMSTART requires both DSNKEY and DSNOUT. No startlist was created. ;
   %return ;
%end ;

/* membership lists are matched against upcased dataset values */
%if %length( &ATTRIBUTE ) > 0 %then %let ATTRIBUTE = %upcase( &ATTRIBUTE ) ;
%if %length( &ROLE      ) > 0 %then %let ROLE      = %upcase( &ROLE ) ;

/* normalize each Y|N flag to 1|0. a blank flag is treated as off (0) explicitly */
/* so that the %IF tests below never see an empty condition                      */
%if %length( &KEEP ) > 0 %then %let KEEP = %eval( %upcase( &KEEP ) = Y ) ;
%else %let KEEP = 0 ;

%if %length( &NOCONTR ) > 0 %then %let NOCONTR = %eval( %upcase( &NOCONTR ) = Y ) ;
%else %let NOCONTR = 0 ;

%if %length( &NOSPECL ) > 0 %then %let NOSPECL = %eval( %upcase( &NOSPECL ) = Y ) ;
%else %let NOSPECL = 0 ;

/*############################################################################*/
/* begin executable statements                                                */
/*############################################################################*/

data &DSNOUT( keep= term freq numdocs keep weight role attribute key ) ;
   set &DSNKEY ;

   /* term length filter: cats() strips leading and trailing blanks before measuring */
   %if %length( &MINLEN ) > 0 %then %do ;
      if length( cats( term )) >= &MINLEN ;
   %end ;

   /* term weight bounds */
   %if %length( &MAXWEIGHT ) > 0 %then %do ;
      if weight <= &MAXWEIGHT ;
   %end ;
   %if %length( &MINWEIGHT ) > 0 %then %do ;
      if weight >= &MINWEIGHT ;
   %end ;

   /* keep only terms flagged Y */
   %if &KEEP %then %do ;
      if keep = 'Y' ;
   %end ;

   /* drop contractions: terms whose first character is an apostrophe */
   %if &NOCONTR %then %do ;
      if substr( term, 1, 1 ) ne "'" ;
   %end ;

   /* drop terms whose first character is one of & ! # @ */
   %if &NOSPECL %then %do ;
      if index( '&!#@', substr( term, 1, 1 )) = 0 ;
   %end ;

   /* corpus frequency bounds */
   %if %length( &MAXFREQ ) > 0 %then %do ;
      if freq <= &MAXFREQ ;
   %end ;
   %if %length( &MINFREQ ) > 0 %then %do ;
      if freq >= &MINFREQ ;
   %end ;

   /* document count bounds */
   %if %length( &MAXDOCS ) > 0 %then %do ;
      if numdocs <= &MAXDOCS ;
   %end ;
   %if %length( &MINDOCS ) > 0 %then %do ;
      if numdocs >= &MINDOCS ;
   %end ;

   /* membership filters: for ATTRIBUTE and then ROLE, turn the blank-separated   */
   /* parameter value into a quoted list and emit  if upcase( var ) in ( list ) ; */
   /* the explicit blank delimiter keeps values such as A-B or A.B in one piece   */
   %do J = 1 %to 2 ;
      %let LISTVAR = %scan( ATTRIBUTE ROLE, &J, %str( ) ) ;
      %let LISTVAL = &&&LISTVAR ;
      %let INLIST  = ;

      %if %length( &LISTVAL ) > 0 %then %do ;
         %let I     = 1 ;
         %let TOKEN = %scan( &LISTVAL, &I, %str( ) ) ;
         %do %while( %length( &TOKEN ) > 0 ) ;
            %let INLIST = &INLIST "&TOKEN" ;
            %let I      = %eval( &I + 1 ) ;
            %let TOKEN  = %scan( &LISTVAL, &I, %str( ) ) ;
         %end ;
      %end ;

      %if %length( &INLIST ) > 0 %then %do ;
         if upcase( &LISTVAR ) in ( &INLIST ) ;
      %end ;
   %end ;
run ;

%mend TMSTART ;