/*****************************************************************
* Name: tmfilter.sas (macro)
*
* Support: Russell Albright and Jim Cox
*
* Usage: %tmfilter( dataset=work.data, dir= ,destdir= , port=0,
*        host=localhost, numbytes=60, language=english,
*        url= , depth=2, norestrict=, links=, username="" , password ="",
*        ext=);
* Purpose: To create a SAS data set containing pointers to documents
* in a directory and all its subdirectories along with a summary
* of the text (currently just the first numchars characters after
* converting the text from its native form to html and then stripping
* out the html tags). Documents of many different formats can be
* handled by this macro.
*
* There are two basic goals you may have for the program. If you are
* not planning to crawl the web, you do not need to specify the crawl
* parameters (url, depth, norestrict, links, username, password).
* If you do wish to crawl the web, those parameters control what is
* passed to the tmcrawl macro. The names data set returned
* from the tmcrawl macro is then used to change the url of the resulting
* data set obtained from tmfilter.
* Parameters
* dataset = name of dataset to contain the file info. Default is WORK.DATA
* dir = path to the directory containing the files to be filtered
* destdir = Location of files to be filtered. This is also the
*           location to write the html files that are found during the crawl.
* host = host name or ip address of machine that the macro is running on.
*        Defaults to localhost
*
* url = url either in the form http://www.url.com or just www.url.com. No default.
*       If this is not specified xcrawl does not run;
* depth = number of levels to crawl. Default is 2: visit all the links
*         on the specified page and then all their links, and then stop.
* links = name of the links data set to be created. Default is WORK._LINKS.
*         This only occurs if url is specified.
* norestrict= If this is set to anything, the crawl will travel outside the domain. Blank
*             means only process within the given domain.
* username = The username for accessing secured website during crawl.
* password = The user's password for accessing a secured website during a crawl
* maxsize = number of bytes of the largest file to be processed max is 10455760
*           (the default in the macro signature is 504557600)
* time = time in seconds to wait for filtering a file default is 1000 seconds.
* ext = restricts the filter to only process files with extensions listed.
*       example: ext= html htm txt . pdf doc
*       ( the "." means files that have no extension)
* manual = indicates if the cfs service has been started manually.
*          A non-zero port must be given with it, otherwise the macro
*          reports an error and does not run the filter.
* port = port number written to the filter config file. Default is 0.
* numbytes, numchars = size of the TEXT summary. If both are 0 then
*          numbytes=60 is used. If only numchars is given it is used as
*          numbytes. If both are given only numbytes is used.
* language = blank separated list of languages. Each entry is checked
*          against the list of supported names before the filter runs.
* transcode = if non-blank, an -oencoding entry is written to the
*          filter config file.
*
* Returns:
*
* History:
* 20Nov01 Coding [rualbr]
* 17Jan03 Converted to work with 9.1 Text Miner and to call %tmcrawl [cox]
* 06Apr03 Added check to ensure languages were spelled correctly
* 15Mar04 Increased format size of URI variable,
* 15Mar04 Write config file out to work dir
* 15Mar04 Read in the log file to the sas log window.
* Notes:
* End
* ************************************************************** */
%macro tmfilter(
   dataset=work.data, dir= ,destdir= , port=0, host=localhost,
   numchars=0,numbytes=0, language=english, url= , depth=2,
   norestrict=, links=work._links, username = ,password = ,
   time=1000,maxsize=504557600, ext=,manual=, transcode=);

   %local tempPath;
   %local workdir;
   %local myRoot;
   %local programDir;
   %local cfsDir;
   %local _ERRMSG1;
   %local _ERRMSG2;
   %local _ERRMSG2a;
   %local _WARNMSG;
   %local _sync;
   %local _notes;
   %local _source;
   %local _ELangError_;
   %local _badLanguage_;
   %global _TMCRAWL;
   %local _catNString;
   %local _fwdString;
   %local _valueString;
   %local _endString;
   %local _cfmString;
   %local _cfsvString;
   %local _filtersDString;
   %local _filtersTString;
   %local _cfsIString;
   %local _cfsFString;
   %local _csmtcpString;
   %local _csmthString;
   %local _csmipString;
   %local _readLogSuccess;
   %local _proceed; /*indicator variable for proceeding*/
   %local _mylog;
   %local _ranName;
   /* Helper variables that are assigned below. Declaring them local keeps
      the macro from overwriting same-named variables of the caller. */
   %local _endString2 _changecwdString _maxSizeString _filterAsciString
          _includeHeadString _includeFootString _xmlstring
          sasLocEnc_ localEnc_ _rc;
   %global _EFIERR_;
   %global _EFIREC_;

   %let _readLogSuccess = 0;
   %let _ERRMSG1=ERROR: xfilter.exe failed: See log below.;
   %let _ERRMSG2=ERROR: xfilter.exe failed, check ;
   %let _ERRMSG2a= for details.;
   /* used to indicate if language list is valid or not*/
   %let _ELangError_ = 0;
   /* %let _ERRMSG2a= to make sure it exists and can be executed.;*/
   %let _WARNMSG=WARNING: Could not delete tmfilter.fin file.;
   %let _proceed=1;

   /* generate a random name*/
   /*%let _ranName=%sysfunc(putn(%sysfunc(int(1e8*%sysfunc(ranuni(0)))),z8.));*/

   %if &sysscp NE WIN %then %do;
      %put ERROR: This macro only runs on the Windows platform.;
      %goto exit;
   %end;

   %let tempPath= c:/temp;
   %let _EFIERR_ = 0; /* set the ERROR detection macro variable */
   %let _EFIREC_ = 0; /* clear export record count macro variable */

   /* Directory where the sas WORK temp directory is*/
   %LET workdir = %SYSFUNC(PATHNAME(work));
   %let myRoot = %SYSGET(sasroot); /* SAS Root Directory*/
   /* directory containing xcrawl and xfilter*/
   %let programDir= &myRoot.\tmine\sasexe;
   %let cfsDir= &myRoot.\tmine\sasmisc;

   /* Record the session encoding and its external name. */
   data _null_;
      do;
         call symput('sasLocEnc_',upcase(getlocenc()));
         select(upcase(getlocenc()));
            when('WLATIN1','LATIN1') call symput('localEnc_','windows-1252');
            when('SHIFT-JIS') call symput('localEnc_','shift_jis');
            when('EUC-KR') call symput('localEnc_', 'euc-kr');
            when('EUC-JP') call symput('localEnc_', 'euc-jp');
            when('EUC-CN') call symput('localEnc_', 'euc-cn');
            when('MS-950') call symput('localEnc_', 'Big5');
            when('UTF-8') call symput('localEnc_', 'UTF-8');
            otherwise call symput('localEnc_', 'windows-1252');
         end;
      end;
   run;

   %if %bquote(&sasLocEnc_) = WLATIN1 %then %do;
      %let sasLocEnc_ = LATIN1;
   %end;

   %if &numchars EQ 0 AND &numbytes EQ 0 %then %do;
      %let numbytes=60;
      %PUT NOTE: The default of numbytes=60 is being used.;
   %end;

   /* make numchars be numbytes*/
   %if &numchars> 0 AND &numbytes > 0 %then %do;
      %PUT NOTE: Both numchars and numbytes were specified. Only numbytes will be used.;
   %end;
   %else %do;
      %if &numchars> 0 %then %do;
         %let numbytes=&numchars;
      %end;
   %end;

   %if((%bquote(&manual) NE )AND (&port EQ 0 ))%then %do;
      %PUT ERROR: The manual option requires a specified port number;
      /* Stop here: previously the error was reported but the crawl and
         the filter were still started with an unusable configuration. */
      %let _proceed=0;
   %end;

   /* If url specified, then run %tmcrawl() */
   %if (%bquote(&url) NE AND &_proceed =1) %then %do;
      %tmcrawl(url=&url, destdir=&dir, depth=&depth, links=&links,
               names=work._names, norestrict=&norestrict,
               username = &username, password=&password);
      %if &_TMCRAWL EQ -1 %then %let _proceed=-1;
   %end;

   %if &_proceed EQ 1 %then %do;

      filename _fname "&workdir.\tmfilter.fin";

      /* Remove a stale completion marker from an earlier run. The return
         code is deliberately ignored: a non-zero value normally just means
         that there was no marker to delete. */
      %let _rc = %sysfunc(fdelete(_fname));

      /* save the options settings. This must always run: it used to be
         executed only when the delete above failed, which left
         work._optDS missing whenever a stale marker had been removed. */
      data work._optDS;
         set SASHELP.VALLOPT(where=(optname in ('SOURCE', 'XSYNC', 'NOTES'))
                             keep=optname setting);
      run;

      /* Pick each setting by option name rather than by row position so
         the result does not depend on the row order of the view. */
      data _NULL_;
         set work._optDS;
         select (optname);
            when ('SOURCE') call symput('_source',trim(setting));
            when ('XSYNC')  call symput('_sync',trim(setting));
            when ('NOTES')  call symput('_notes',trim(setting));
            otherwise;
         end;
      run;

      /* as long a directory is specified we can run xfilter over the directory*/
      options XSYNC;

      %if %nrbquote(&dir) NE %then %do;

         /* check language list*/
         Data _NULL_;
            length parsedlang $200;
            length concatlang $200;
            concatlang = symget('language');
            do n=1 to 16;
               parsedlang = scan(concatlang,n,' ');
               parsedlang = trim(left(parsedlang));
               if lengthn(parsedlang) >0 then do;
                  if parsedlang not in (
                        "english","french","german",
                        "spanish","portuguese","danish","dutch",
                        "simplified-chinese", "traditional-chinese",
                        "japanese", "korean", "bokmal", "nynorsk",
                        /* CCJK and Norwegian(Bokmal&Nynorsk), added by Rodger */
                        "finnish","italian","swedish") then do;
                     call symput('_ELangError_', '1');
                     call symput('_badLanguage_',trim(parsedlang)||' is not valid.');
                  end;
               end;
            end;
         run;

         %if &_ELangError_ = 1 %then %do;
            /* The message used to be wrapped in quotes, so the log line did
               not start with ERROR: and was not flagged as an error. */
            %put ERROR: The language %superq(_badLanguage_);
         %end;
         %else %do;

            /* write config file*/
            DATA _NULL_;
               file "&workdir.\filterconfig.txt";
               do;
                  put "-log";
                  put "&workdir.\filterLog.txt";
                  put "-cfs";
                  put "&cfsDir";
                  put "-dataset";
                  put "&workdir.\data.txt";
                  put "-host";
                  put "&host";
                  put "-numbytes";
                  put "&numbytes";
                  put "-language";
                  put "&language";
                  put "-dir";
                  put "&dir";
                  put "-port";
                  put "&port";
                  put "-time";
                  put "&time";
                  put "-langdir";
                  put "&cfsdir";
                  put "-maxsize";
                  put "&maxsize";
                  %if %bquote(&ext) NE %then %do;
                     put "-ext";
                     put "&ext";
                  %end;
                  %if %bquote(&manual) NE %then %do;
                     put "-manual";
                  %end;
                  put "-encoding";
                  select(upcase(getlocenc()));
                     when('WLATIN1','LATIN1') put "cp_1252";
                     when('SHIFT-JIS') put "shift_jis";
                     when('EUC-KR') put "euc_kr";
                     when('EUC-JP') put "euc_jp";
                     when('EUC-CN') put "euc_cn";
                     when('MS-950') put "big5";
                     when('UTF-8') put "utf_8";
                     when('MS-949') put "euc_kr";
                     otherwise put "cp_1252";
                  end;
                  /* quoted comparison, consistent with the other optional
                     parameters, so special characters cannot break the test */
                  %if %bquote(&transcode) NE %then %do;
                     put "-oencoding";
                     select(upcase(getlocenc()));
                        when('WLATIN1','LATIN1') put "cp_1252";
                        when('SHIFT-JIS') put "shift_jis";
                        when('EUC-KR') put "euc_kr";
                        when('EUC-JP') put "euc_jp";
                        when('EUC-CN') put "euc_cn";
                        when('MS-950') put "big5";
                        when('UTF-8') put "utf_8";
                        when('MS-949') put "euc_kr";
                        otherwise put "cp_1252";
                     end;
                  %end;
                  put "-options";
                  put "&workdir.\_cfsoptions.txt";
                  %if %nrbquote(&destdir) NE %then %do;
                     put "-destdir";
                     put "&destdir";
                  %end;
               end;
            RUN;

            /* NOTE(review): the section below writes the option entries that
               the config file refers to as _cfsoptions.txt, but this copy of
               the source appears to have lost text here: _catNString is
               empty, _valueString and _endString are never assigned, the
               bquote call that starts _xmlstring is not closed, and the
               DATA and FILE statements that should open the step are
               missing. It is reproduced unchanged and must be restored from
               a pristine copy of the macro before use. */
            %let _catNString = %bquote();
            %let _endString2 = %str(%'?>);
            %let _cfmString = %bquote("CatLogFileManager.FilePath");
            %let _cfsvString = %bquote("CFS.VerboseLog");
            %let _filtersTString =%bquote("Filters.Timeout");
            %let _cfsIString =%bquote("CFS.InputEncoding");
            %let _cfsFString =%bquote("CFS.FilterASCII");
            %let _csmtcpString =%bquote("CatSocketMgr.TCPPort");
            %let _csmthString =%bquote("CatSocketMgr.ThreadCount");
            %let _csmipString =%bquote("CatSocketMgr.IPSecurity");
            %let _changecwdString =%bquote("CFS.ChangeCWD");
            %let _maxSizeString=%bquote("Filters.maxSize");
            %let _filterAsciString=%bquote("CFS.FilterASCII");
            %let _includeHeadString=%bquote("CFS.IncludeHeader");
            %let _includeFootString=%bquote("CFS.IncludeFooter");
            %let _xmlstring= %bquote("; put "&_catNString.&_cfmString. &_valueString.&workdir.\_cfsLog.txt&_endString";
            put "&_catNString.&_cfsvString. &_valueString.false&_endString";
            put "&_catNString.&_filtersTString. &_valueString.120&_endString";
            put "&_catNString.&_cfsIString. &_valueString.iso-8859-1&_endString";
            put "&_catNString.&_cfsFString. &_valueString.false&_endString";
            put "&_catNString.&_csmtcpString. &_valueString.22220&_endString";
            put "&_catNString.&_csmthString. &_valueString.5&_endString";
            put "&_catNString.&_csmipString. &_valueString.*&_endString";
            put "&_catNString.&_changecwdString. &_valueString.false&_endString";
            put "&_catNString.&_filterAsciString. &_valueString.false&_endString";
            put "&_catNString.&_maxSizeString. &_valueString.504557600&_endString";
            put "&_catNString.&_includeHeadString. &_valueString.false&_endString";
            put "&_catNString.&_includeFootString. &_valueString.false&_endString";
            put "";
            end;
            run;

            /* Batch file that starts xfilter from its program directory. */
            DATA _NULL_;
               file "&workdir.\tmfilter.bat";
               do;
                  EFIOUT + 1;
                  put "CD /D &programDir";
                  put " xfilter -config &workdir.\filterconfig.txt";
                  put "exit";
               end;
            RUN;

            x "%bquote("&workdir.\tmfilter.bat")";

            /* the sleep is needed to give the os time to close the file and
               remove the lock on the infile below. Otherwise "file in use error".*/
            DATA _NULL_;
               call sleep(10,.1);
            run;

         %end; /* end if _ELANGERROR_ != 1*/

         /* Copy the filter log into the SAS log and put the saved option
            settings back. Both outcomes below used to repeat this code. */
         %let _mylog = &workdir.\filterLog.txt;
         options nosource nonotes;
         %if %sysfunc(fileexist(&_mylog)) %then %do;
            filename filrf "&_mylog";
            data _NULL_;
               infile filrf truncover lrecl=200;
               input c $200.;
               put c;
            run;
         %end;
         OPTIONS &_source &_sync &_notes;

         /* The completion marker tmfilter.fin is tested to decide whether
            the filter finished. */
         %if %sysfunc(fexist(_fname)) %then %do;

            /* create the datasets*/
            data &dataset(compress=yes);
               %let _EFIERR_ = 0; /* set the ERROR detection macro variable */
               infile "&workdir./data.txt" delimiter='09'x MISSOVER DSD
                      lrecl=60000 firstobs=2 ; /* maximum record length*/
               length TEXT $ &numbytes;
               informat TEXT $KTRUNC&numbytes..;
               informat URI $KTRUNC2048. ;
               informat NAME $KTRUNC255. ;
               informat FILTERED $KTRUNC255. ;
               informat TRUNCATED best32. ;
               informat OMITTED best32. ;
               informat LANGUAGE $KTRUNC20.;
               informat EXTENSION $KTRUNC20. ;
               informat CREATED DATETIME. ;
               informat ACCESSED DATETIME. ;
               informat MODIFIED DATETIME. ;
               informat SIZE best12. ;
               format TEXT $KTRUNC&numbytes..;
               format URI $2048. ;
               format NAME $255. ;
               format FILTERED $255. ;
               format TRUNCATED best12. ;
               format OMITTED best12. ;
               format LANGUAGE $20.;
               format EXTENSION $20. ;
               format CREATED DATETIME. ;
               format ACCESSED DATETIME. ;
               format MODIFIED DATETIME. ;
               format SIZE best12. ;
               input TEXT $ URI $ NAME $ filtered $ TRUNCATED OMITTED
                     LANGUAGE $ EXTENSION CREATED ACCESSED MODIFIED SIZE ;
               /* set ERROR detection macro variable. A character value is
                  passed so no numeric-to-character conversion takes place. */
               if _ERROR_ then call symput('_EFIERR_','1');
            run;

            /* If we ran %tmcrawl and work.names exists, need to change the
               uri to correspond to the source location. */
            %if %bquote(&url) NE and %sysfunc(exist(work._names)) %then %do;
               data work._names;
                  length uri $2048;
                  set work._names;
                  uri = "file://" || _filename;
               run;
               /* name the data set instead of relying on _LAST_ */
               proc sort data=work._names;
                  by uri;
               run;
               proc sort data=&dataset;
                  by uri;
               run;
               data &dataset (drop=_filename _url);
                  merge &dataset work._names(in=a);
                  by uri;
                  if a;
                  uri = _url;
               run;
            %end;

            /* reset the variable lengths to the length of the largest
               string for all character variables */
            %global _tmMaxLenText;
            %global _tmMaxLenURI;
            %global _tmMaxLenName;
            %global _tmMaxLenFiltered;
            %global _tmMaxLenLanguage;
            %let _tmMaxLenText=0;
            %let _tmMaxLenURI=0;
            %let _tmMaxLenName=0;
            %let _tmMaxLenFiltered=0;
            %let _tmMaxLenLanguage=0;

            data _null_ ;
               set &dataset end=eof;
               drop maxLenText maxLenURI maxLenName maxLenFiltered maxLenLanguage;
               retain maxLenText 0 maxLenURI 0 maxLenName 0
                      maxLenFiltered 0 maxLenLanguage 0;
               maxLenText=max(maxLenText,length(trim(left(text)))) ;
               /* special case 0 length of text*/
               if maxLenText=0 then maxLenText=1;
               maxLenURI=max(maxLenURI,length(trim(left(uri)))) ;
               maxLenName=max(maxLenName,length(trim(left(name)))) ;
               maxLenFiltered=max(maxLenFiltered,length(trim(left(filtered)))) ;
               maxLenLanguage=max(maxLenLanguage,length(trim(left(language)))) ;
               if eof then do;
                  call symput("_tmMaxLenText",put(maxLenText,6.)) ;
                  call symput("_tmMaxLenURI",put(maxLenURI,6.)) ;
                  call symput("_tmMaxLenName",put(maxLenName,6.)) ;
                  call symput("_tmMaxLenFiltered",put(maxLenFiltered,6.)) ;
                  call symput("_tmMaxLenLanguage",put(maxLenLanguage,6.)) ;
               end;
            run ;

            /* When the data set has no rows the step above never reaches its
               end-of-file branch and the values stay 0, which is not a legal
               character length. Use the minimum length of 1 in that case. */
            %if &_tmMaxLenText < 1 %then %let _tmMaxLenText=1;
            %if &_tmMaxLenURI < 1 %then %let _tmMaxLenURI=1;
            %if &_tmMaxLenName < 1 %then %let _tmMaxLenName=1;
            %if &_tmMaxLenFiltered < 1 %then %let _tmMaxLenFiltered=1;
            %if &_tmMaxLenLanguage < 1 %then %let _tmMaxLenLanguage=1;

            /* Shrink the character variables and keep only rows whose
               TRUNCATED flag is 0 or 1. */
            data &dataset(compress=yes);
               length TEXT $ &_tmMaxLenText ;
               length uri $ &_tmMaxLenUri ;
               length NAME $ &_tmMaxLenName ;
               length FILTERED $ &_tmMaxLenFiltered ;
               length LANGUAGE $ &_tmMaxLenLanguage ;
               set &dataset ;
               if truncated = 1 or truncated =0 then output;
            run ;

            /* FILTERED is dropped when its longest value is shorter than 2 */
            %if &_tmMaxLenFiltered < 2 %then %do;
               data &dataset(compress=yes);
                  set &dataset(drop=FILTERED);
               run;
            %end;

            proc freq data=&dataset;
               tables language;
               tables truncated*omitted;
            run;

            %if %sysfunc( fdelete(_fname)) %then %put %bquote(&_WARNMSG);

            DATA _NULL_;
               call sleep(10,.1);
            run;

         %end;/* end to this line: %if %sysfunc(fexist(_fname)) %then %do;*/
         %else %do;
            %put ERROR: The filtering terminated prematurely.;
         %end;

      %end; /* If directory specified */
      %else %do;
         %PUT ERROR: The filter did not execute. Check your syntax. The DIR option is required.;
         /* set the option back: XSYNC was switched on above and used to be
            left on when this branch was taken */
         OPTIONS &_source &_sync &_notes;
      %end;

   %end; /* end the proceed*/

%exit:
%mend tmfilter;