/* NOTE(review): this copy of the file appears to be damaged.
   - The header comment that starts on the next line has no terminator of
     its own. It stays open until the end of the Delete candidates remark
     further down, so every statement before that point is currently comment
     text and not live code.
   - The Usage section breaks off in the middle and runs straight into
     data step statements. The macro definition statement that should pair
     with the closing mend TEXTSYN statement is not present, and neither is
     the beginning of the first data step.
   Restore the missing text from a pristine copy of textsyn.sas before
   relying on this file. Everything below is reproduced as found, with the
   same tokens in the same order, only re-wrapped onto separate lines. */
/* ****************************************************************
 * Copyright (C) 2004 by SAS Institute Inc., Cary, NC 27513
 *
 * Name: textsyn.sas
 * Support: cox James A. Cox
 * Product: SAS Text Miner
 * Language: Sas Macro
 * Script:
 *
 * Usage: %textsyn( termds=
 * , synds=
 * , 0 ;
    firstchar = substr( term, pos, 1 ) ;
    if numdocs >= &MNPARDOC then output _parent ;
    if numdocs <= &MXCHDDOC & parent = . then do ;
        label childterm = 'child term' ;
        childterm = term ;
        childndocs = numdocs ;
        childlen = length( trim( term )) ;
        dict = 'N' ;
        output _child ;
    end ;
run ;

%if &DICT ne %then %do ;
    proc sort data=_child ; by childterm ; run ;

    data _child ;
        merge &DICT( rename=( term=childterm ) in=a ) _child( in=b ) ;
        by childterm ;
        if a & b then dict = 'Y' ;
        if b then output _child ;
    run ;
%end ;

proc sql ;
    create table _syncands as
    select a.firstchar
         , a.numdocs
         , a.term
         , a.role
         , b.childterm
         , b.childndocs
         , b.childlen
         , b.key
         , b.dict
    from _parent a
       , _child b
    where a.firstchar = b.firstchar
        & a.numdocs > b.childndocs
        & a.role = b.role ;
quit ;

data _syncands( keep=parent term numdocs childndocs minsped key category dict ) ;
    set _syncands ;
    parent = term ;
    cterm = childterm ;
    category = role ;
    if substr( term, length( term ), 1 ) = '.'
        then term = substr( term, 1, length( term ) - 1 ) ;
    if substr( childterm,length( childterm ), 1 ) = '.'
        then childterm = substr( childterm, 1, length( childterm ) - 1 ) ;
    minlen = min( length( term ), length( childterm )) ;
    /* Delete candidates where child is parent with other info appended */
    /* Drop the row when childterm is more than one character longer than
       parent and its leading characters equal parent. */
    if length( childterm ) > length( parent )+1 then
        if substr( childterm,1, length( parent )) = parent then delete ;
    /* lendiff = abs( length( childterm ) - length( term )) */
    /* SPEDIS is evaluated in both argument orders. Each result is multiplied
       by the length of its first argument, divided by the shorter of the two
       lengths and truncated to an integer. The smaller value is kept. */
    spedis1 = int((spedis( term, childterm ) * length( term ))/ minlen );
    spedis2 = int((spedis( childterm, term ) * length( childterm ))/minlen );
    minsped = min( spedis1, spedis2 );
    /* Rows whose dict flag is Y have the distance multiplied by DICTPEN. */
    if dict = 'Y' then minsped = minsped * &DICTPEN ;
    /* For multi-word terms, need to increase minsped */
    if scan( term, 2, ' ') ne ' ' or scan( childterm, 2, ' ') ne ' '
        then minsped = minsped * &MULTIPEN ;
    /* Subsetting IF: only rows with a distance of at most MAXSPED go on. */
    if minsped <= &MAXSPED ;
    /* term is overwritten with cterm before the row is written out. */
    term = cterm ;
run ;

/* Sort each category and term group by ascending minsped and then by
   descending numdocs, which decides the row that the next step keeps. */
proc sort data=_syncands ; by category term minsped descending numdocs ; run ;

/* Keep only the first row of every category and term group. */
data _syncands ;
    set _syncands ;
    by category term ;
    if first.term ;
run ;

/* show examples of word in context */
/* This branch runs only when DOCDS, OUTDS and TEXTVAR are all non-blank.
   Otherwise the else branch near the end copies _syncands to SYNDS. */
%if ( &DOCDS ne & &OUTDS ne & &TEXTVAR ne ) %then %do ;

    /* Join the candidates to OUTDS where key equals _termid_ and to DOCDS
       where _document_ matches, which brings in the TEXTVAR column. */
    proc sql ;
        create table &SYNDS as
        select term
             , parent
             , category
             , childndocs
             , numdocs
             , &TEXTVAR
             , minsped
             , dict
        from _syncands as a
           , &DOCDS as b
           , &OUTDS as c
        where ( c._document_ = b._document_)
            & ( a.key = c._termid_) ;
    quit ;

    proc sort data=&SYNDS ; by category term ; run ;

    /* Build the example string. Only the first row of a group and the row
       right after a first row are processed and written. lastfirst carries
       the first.term flag of the previously processed row. */
    data &SYNDS( drop=&TEXTVAR tmpstr tmpterm pos i pos2 lastfirst) ;
        length example $300 tmpstr $32000 tmpterm $200 ;
        retain lastfirst ;
        set &synds ;
        by category term ;
        if first.term or lastfirst then do ;
            /* Look for the term in the lower-cased text. */
            pos = index( lowcase( &TEXTVAR), trim( term )) ;
            if pos then do ;
                /* The matched text is wrapped in double exclamation marks. */
                example = '!!' ||substr( &TEXTVAR, pos, length( term )) || '!!' ;
                if pos > 1 then do ;
                    /* Left context. tmpstr is the reversed text before the
                       match. The loop leaves tmpterm as word number
                       CONTEXT + 1 of that reversed text and pos2 is where
                       INDEXW finds it. When such a word exists, an ellipsis
                       and the pos2 - 1 characters just before the match are
                       prepended. Otherwise all preceding text is prepended. */
                    tmpstr = reverse( substr( &TEXTVAR, 1, pos - 1 )) ;
                    do i=1 to &CONTEXT + 1 ;
                        tmpterm = scan( tmpstr, i, ' ') ;
                    end ;
                    pos2 = indexw( tmpstr, trim( tmpterm )) ;
                    if tmpterm ne ' ' and pos2 > 0 then example = '...'
                        || trim( substr( &textvar, pos - pos2 + 1, pos2 - 1 )) || ' ' || trim( example) ;
                    else example = substr( &TEXTVAR, 1, pos - 1 ) || ' ' || trim( example ) ;
                end ;
                if pos + length( term) < length( &TEXTVAR ) then do ;
                    /* Right context. tmpstr is the text after the match and
                       the same word search is repeated on it. When the word
                       exists and pos2 is below the length of tmpstr, the
                       first pos2 - 1 characters and an ellipsis are appended.
                       Otherwise all of tmpstr is appended.
                       NOTE(review): unlike the left side, this branch does
                       not test pos2 > 0 - confirm that is intended. */
                    tmpstr=substr( &TEXTVAR, pos + length( term )) ;
                    do i=1 to &CONTEXT + 1 ;
                        tmpterm = scan( tmpstr, i, ' ') ;
                    end ;
                    pos2 = indexw( tmpstr, trim( tmpterm )) ;
                    if tmpterm ne ' ' and pos2 < length( tmpstr) then example = trim( example) || trim( substr( tmpstr,1,pos2 - 1 )) || '...' ;
                    else example = trim( example) || tmpstr ;
                end ;
            end ;
            else do ;
                /* Term not found: use the whole text and write a log line. */
                example = &TEXTVAR ;
                put term 'not found in ' example ;
            end ;
            lastfirst = first.term ;
            output ;
        end ;
        else lastfirst = 0 ;
    run ;

    /* Collapse to one row per category and term group. example1 comes from
       the first row of the group. example2 comes from the last row when
       that is a different row, and is blank otherwise. */
    data &SYNDS( drop=example ) ;
        length example1 example2 $300 ;
        retain example1 ;
        set &SYNDS ;
        by category term ;
        if first.term then example1 = example ;
        else example2 = example ;
        if last.term then output ;
    run ;
%end ;
%else %do ;
    /* No example lookup requested: copy the candidates without key. */
    data &SYNDS( drop=key ) ;
        set _syncands ;
    run ;
%end ;
%mend TEXTSYN ;

/***
%textsyn( termds=&_MAC_3
        , docds=&_MAC_1
        , outds=&_MAC_2
        , textvar=cnt_desc
        , mnpardoc=3
        , mxchddoc=6
        , synds=sampsio.autosyns
        , dict=sampsio.engdict
        , maxsped=15
        ) ;
***/