/* ****************************************************************
 * Name:    tmcrawl.sas
 * Support: Jim Cox
 *
 * Usage:   %tmcrawl(url="", destdir= ,
 *                   [depth=,] [links=,]
 *                   [names=,] [norestrict=,] [username=,] [password=]);
 *
 * Purpose: To crawl a web page and any of its links, and its links' links,
 *          out to the depth specified.
 *
 * Parms:   url        - the url to start at (e.g. http://www.sas.com)
 *          destdir    - the location to put all the files found by the crawler.
 *          depth      - the number of links to traverse from the starting
 *                       location. Defaults to 2.
 *          links      - the name of the SAS data set that shows all the links
 *                       between files in that directory. Defaults to work._links.
 *          names      - the name of the SAS data set that shows the conversion
 *                       from url to the corresponding filename in destdir.
 *                       Defaults to work._names.
 *          norestrict - if specified as non-blank, the crawl may go outside
 *                       the starting domain.
 *          username   - user name passed to the crawler (optional).
 *          password   - password passed to the crawler (optional).
 * Returns: sets the macro variable _TMCRAWL (0 = success, -1 = error).
 *
 * History:
 *   16Jan03  Initial Coding [cox]
 *   08Mar03  Added password and username [rualbr]
 *
 * Last Modified By:
 * Last Modified On: Wed Feb 19 15:53:06 2003
 * Notes:
 * End
 * ************************************************************** */

%macro tmcrawl(url=, destdir=, depth=2, norestrict=,
               links=work._links, names=work._names,
               username=, password=);

  %local _OLDXSYNC;
  %local _WORKDIR;
  %local _MYROOT;
  %local _PROGRAMDIR;
  %local _CFSDIR;
  %local _ERRMSG1;
  %local _ERRMSG2;
  %local _ERRMSG3;
  %local _ERRMSG3a;
  %local _WARNMSG2;
  %local _WARNMSG3;
  %local _WARNMSG4;
  %local _OMITTED_MSG;
  %local _notes;
  %local _source;
  %local _xsync;
  %local _NAMEFILE;
  %local _URLLEN;

  %let _OMITTED_MSG=Omitted urls;
  %let _WARNMSG2=WARNING: Could not delete omitted.txt file;
  %let _WARNMSG3=WARNING: Could not delete names.txt file;
  %let _WARNMSG4=WARNING: Could not delete links.txt file;
  %let _ERRMSG2=ERROR: Xcrawl.exe failed: See reason above.;
  %let _ERRMSG1=ERROR: You must specify both a url (using url=) and a directory (using destdir=) on the tmcrawl call.;
  %let _ERRMSG3=ERROR: xcrawl.exe unable to be run, check ;
  %let _ERRMSG3a= to make sure it exists and can be executed.;
  %let _TMCRAWL=0;
  %let _URLLEN=2048;

  /* Save the current option settings; they are restored as a group at the
     end of the macro. */
  data work._coptDS;
    set SASHELP.VALLOPT(where=(optname in ('SOURCE', 'XSYNC', 'NOTES'))
                        keep=optname setting);
  run;

  data _NULL_;
    set work._coptDS;
    if _N_ EQ 1 then call symput('_source',setting);
    if _N_ EQ 2 then call symput('_xsync',setting);
    if _N_ EQ 3 then call symput('_notes',setting);
  run;

  /* %let _OLDXSYNC=%sysfunc(optgetn(XSYNC)); */

  /* Directory where the SAS WORK temp directory is */
  %LET _WORKDIR = %SYSFUNC(PATHNAME(work));
  /* SAS root directory */
  %let _MYROOT = %SYSGET(sasroot);
  /* Directories containing the xcrawl and xfilter executables and support files */
  %let _PROGRAMDIR=&_MYROOT.\tmine\sasexe;
  %let _CFSDIR=&_MYROOT.\tmine\sasmisc;

  /* URL and DESTDIR must both be specified */
  %if %bquote(&url)= or %bquote(&destdir)= %then %do;
    %put %bquote(&_ERRMSG1);
    %let _TMCRAWL=-1;
  %end;
  %else %do;

    /* If the XSYNC option is off, make sure it is on while the external
       command runs. */
    OPTIONS XSYNC;

    filename _fname  "&destdir.\names.txt";
    filename _fname2 "&destdir.\links.txt";
    filename _fname3 "&destdir.\omitted.txt";

    /* Delete any output files left over from a previous run; the return
       codes are ignored. */
    %if %sysfunc(fdelete(_fname))  %then;
    %if %sysfunc(fdelete(_fname2)) %then;
    %if %sysfunc(fdelete(_fname3)) %then;

    options nosource nonotes;

    /* Write the configuration file read by xcrawl. */
    DATA _NULL_;
      length url $&_URLLEN;
      file "&_WORKDIR.\crawlconfig.txt" lrecl=2000;
      do;
        EFIOUT + 1;
        put "-log";
        put "&_WORKDIR.\crawlLog.txt";
        put "-url";
        put "&url";
        put "-depth";
        put "&depth";
        put "-destdir";
        put "&destdir";
        %if &norestrict= %then %do;
        put "-domain";
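        /* The -domain flag is written only when norestrict is blank; per the
           norestrict description in the header, this presumably keeps the
           crawl within the starting url's domain. */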
        %end;
        %if &username NE %then %do;
        put "-username";
        put "&username";
        put "-password";
        put "&password";
        %end;
      end;
    RUN;

    /* Write a batch file that changes to the program directory and runs
       xcrawl against the configuration file. */
    DATA _NULL_;
      file "&_WORKDIR.\tmcrawl.bat";
      put "CD /D &_PROGRAMDIR";
      length programOptions $500;
      programOptions = "xcrawl -config &_WORKDIR.\crawlconfig.txt";
      put programOptions;
      put "exit";
    RUN;

    x %bquote("&_WORKDIR.\tmcrawl.bat");

    /* Now display the log from xcrawl in the log window */
    /* %put &_WORKDIR.\crawlLog.txt. */
    %if %sysfunc(fileexist("&_WORKDIR.\crawlLog.txt")) %then %do;

      DATA _NULL_;
        infile "&_WORKDIR.\crawlLog.txt" truncover;
        input line $100.;
        put line;
      run;

      /* List the urls that the crawler omitted, with the reason for each. */
      data _null_;
        length url $&_URLLEN;
        length reason $255;
        infile _fname3 missover dsd lrecl=32767 delimiter='09'x firstobs=2;
        input index url reason $;
        if _n_ = 1 then do;
          put;
          put;
          put "&_OMITTED_MSG";
        end;
        put index ": " url " - " reason;
      run;

      /* Delete the file omitted.txt */
      %if %sysfunc(fdelete(_fname3)) %then %put %bquote(&_WARNMSG2);

      /* Create a data set from the names file that xcrawl created, then
         delete that file from the folder that will be processed with
         xfilter. */
      %if %sysfunc(fexist(_fname)) %then %do;
        data &names;
          infile _fname delimiter='09'x TRUNCOVER DSD lrecl=32767 firstobs=2;
          length _filename _url $&_URLLEN;
          input INDEX _FILENAME $ _URL $;
          if _ERROR_ then call symput('_EFIERR_',1);
        run;

        /* Delete the file names.txt */
        %if %sysfunc(fdelete(_fname)) %then %put %bquote(&_WARNMSG3);

        /* Now do the same with the links file: create a links data set from it */
        %if %sysfunc(fexist(_fname2)) %then %do;
          PROC IMPORT OUT=&links DATAFILE="&destdir.\links.txt" DBMS=TAB REPLACE;
            GETNAMES=YES;
            DATAROW=2;
          RUN;
          /* Delete the file links.txt */
          %if %sysfunc(fdelete(_fname2)) %then %put %bquote(&_WARNMSG4);
        %end;
        %else %do;
          %put %bquote(&_ERRMSG2);
          %let _TMCRAWL=-1;
        %end;
      %end;
      %else %do;
        %put %bquote(&_ERRMSG2);
        %let _TMCRAWL=-1;
      %end;
    %end;
    %else %do;
      %put %bquote(&_ERRMSG3) %bquote("&_PROGRAMDIR.\xcrawl.exe") %bquote(&_ERRMSG3a);
      %let _TMCRAWL=-1;
    %end;
  %end;

  /* Deassign the _fname, _fname2, and _fname3 filerefs */
  filename _fname;
  filename _fname2;
  filename _fname3;

  /* Set the SOURCE, XSYNC, and NOTES options back to their saved settings */
  OPTIONS &_source &_xsync &_notes;

%mend;

/* Example usage */
/* %tmcrawl(url=http://www.sas.com, destdir=c:\sasv9\dmine\sasmisc\textmine\lang\filtres,depth=2); */
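
/* A second example (a sketch): the url, destination path, data set names,
   and credentials below are placeholders. It assumes the _TMCRAWL flag,
   which the macro sets to 0 on success and -1 on error without declaring it
   %local, is still visible to the calling program after the macro returns. */
/*
%tmcrawl(url=http://www.sas.com, destdir=c:\temp\crawl, depth=3,
         links=work.mylinks, names=work.mynames,
         username=myuser, password=mypass);
%put NOTE: tmcrawl return flag _TMCRAWL=&_TMCRAWL;
proc print data=work.mynames(obs=10);
run;
*/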