OBSERVATIONS ARTICLE CODE: Online issue #15


         
          /******************************************************************/
          /*       Appendix B - SAS Code files for Obswww15 - Patridge      */
          /*                                                                */
          /*                                                                */
          /*     'Source Listings of SAS Programs/Macros for Fuzzy Match'   */  
          /*                          SOURCE LISTING                        */
          /*             FILE REF = PRTFUZZ AND FILE TYPES = SAS            */
          /*                                                                */
          /* -------------------- FILEREF=PRTFUZZ -----------------------   */
          /*                                                                */
          /*                       OBS      SRCCODE                         */
          /*                                                                */
          /*                        1     DROPCHAR.SAS                      */
          /*                        2     EMPTYYN.SAS                       */
          /*                        3     FUZZY.SAS                         */
          /*                        4     NORECDS.SAS                       */
          /*                        5     READSTD.SAS                       */
          /*                        6     REMOVEDB.SAS                      */
          /*                        7     SNDSLIKE.SAS                      */
          /*                        8     STDLIST.SAS                       */
          /*                        9     TOKENIZE.SAS                      */
          /******************************************************************/
              
                                    
                                     
                         
                                                                              
   /***************************************************************/          
   /*** MACRO DROPCHAR.SAS                                      ***/          
   /***                                                         ***/          
   /***  Delete a string of characters from the end of any word ***/          
   /***************************************************************/          
                                                                              
   %macro dropchar( string, char );                                           
    zzcnt = 0;                                                                
    zzl = length( "&char" );                                                  
    do zzw = 1 to 100;/*** how many words are there in string ***/            
       if ' ' ne scan( &string, zzw) then zzcnt + 1;                          
    end;                                                                      
    zzw = 0;                                                                  
    length zzwordo $ 200 zzword $ 200 ;                                       
    do zzw = 1 to zzcnt; /*** loop for as many words as found ***/            
     zzwordo = scan(&string,zzw); /*** keep original word ***/                
     zzword  = zzwordo;           /*** new word to be modified ***/           
       /*** check the last char of the word and set to blank ***/             
       zzword = left(reverse(zzword));                                        
       if substr(zzword,1,zzl)=reverse("&char") then substr(zzword,1,zzl)=' ';
       zzword = left(reverse(zzword));                                        
     zzword  = compress(zzword, ' '); /*** remove all internal blanks ***/    
               /*** replace original word with new word ***/                  
               /*** and remove multiple blanks ***/                           
     &string = compbl (tranwrd( &string,trim(zzwordo),trim(zzword)));         
    end;                                                                      
     drop zzl zzw zzcnt zzwordo zzword; /*** drop these vars ***/             
   %mend dropchar;                                                            

        
 
                                                                           
     *********************************************************************;
     **  PROGRAM: EMPTYYN  (MACRO)                                        ;
     **  AUTHOR: CHUCK PATRIDGE                                           ;
     **  DATE: 10/12/94                                                   ;
     **  PURPOSE: DETERMINE IF A SAS DATASET IS EMPTY.                    ;
     **  INPUT PARAMETER: DSNAME (NAME OF SAS DATA SET)                   ;
     **                   ALSO CAN BE BLANK ( USE LAST DATASET CREATED)   ;
     **  OUTPUT: WILL CREATE THE FOLLOWING GLOBAL MACRO VARIABLES         ;
     **            EMPPTYYN               Y=EMPTY, N=NONEMPTY             ;
     **            NUMOBS             WHICH PROVIDES NUMBER OF RECORDS    ;
     **            DSN                WHICH PROVIDES NAME OF DATA SET     ;
     ** SAMPLE CALL: %EMPTYYN(DATASET)                                    ;
     **                                                                   ;
     *********************************************************************;
                                                                           
     %GLOBAL EMPTYYN NUMOBS DSN;                                           
     %MACRO EMPTYYN(DSNAME);                                               
       DATA _NULL_;                                                        
        IF "&DSNAME " = " " THEN CALL SYMPUT('DSNAME','_LAST_');           
       RUN;                                                                
                                                                           
       DATA _NULL_;                                                        
        IF 0 THEN SET &DSNAME NOBS=NUMOBS;                                 
        IF NUMOBS > 0 THEN EMPTYYN = 'N';                                  
                      ELSE EMPTYYN = 'Y';                                  
        CALL SYMPUT('EMPTYYN',PUT(EMPTYYN, $1.));                          
        CALL SYMPUT('NUMOBS' ,PUT(NUMOBS , BEST.));                        
        CALL SYMPUT("DSN"    ,PUT("&DSNAME" , $VARYING17.));               
       RUN;                                                                
      %MEND EMPTYYN; RUN;                                                  
     ********  END OF Program - Emptyyn ***********;                       

   
                                                                                                          
    /*******************************************************************/                                 
    /***fuzzy.sas                                                    ***/                                 
    /***                                                             ***/                                 
    /***  Author: Charles Patridge                                   ***/                                 
    /***          PDPC, Ltd.                                         ***/                                 
    /***          172 Monce Road                                     ***/                                 
    /***          Burlington, CT 06013                               ***/                                 
    /***          Home: 860-673-9278 or 860-675-9026                 ***/                                 
    /***          Email: TVJB41A@prodigy.com                         ***/                                 
    /***          Email: Charles_S_Patridge@prodigy.com              ***/                                 
    /***                                                             ***/                                 
    /***  Copyrighted 1984, 1989, 1993, 1994, 1996, 1998             ***/                                 
    /***  With Author's permission, you may use this program         ***/                                 
    /***  as agreed upon by Author.  It is illegal to distribute     ***/                                 
    /***  this program for profit or misrepresent the original       ***/                                 
    /***  owner/author of this Fuzzy Merge Routine(s).               ***/                                 
    /***                                                             ***/                                 
    /***  Match potential records between two files , duplicates     ***/                                 
    /***  After Files have been standardized Names, Addresses, State ***/                                 
    /***  Find number of words found in Name and Address fields      ***/                                 
    /***  Find Pct of words over all words                           ***/                                 
    /***                                                             ***/                                 
    /***  Ignore finding a record against itself (dup macro var)     ***/                                 
    /***  Search Only records that have the same 1st "3"             ***/                                 
    /***    characters of zipcode (USA zip codes)                    ***/                                 
    /***    and have same gender and birthdate                       ***/                                 
    /***                                                             ***/                                 
    /***  To Determine Duplicates, First Name or Last Name           ***/                                 
    /***    must be matched (order of appearance makes no            ***/                                 
    /***    difference).  Initials will throw off a match.           ***/                                 
    /***                                                             ***/                                 
    /***  This routines employs the use of the soundex function      ***/                                 
    /***    to compensate for like sounding names.                   ***/                                 
    /***                                                             ***/                                 
    /*******************************************************************/                                 
    Libname  merge  "C:\download\merge"; run;                                                             
    filename macros "C:\download\merge"; run;                                                             
    %include macros( 'emptyyn.sas'  ); /*** Determine NOBS of a DATASET ***/                              
    %include macros( 'sndslike.sas' ); /*** Soundex Function Macro ***/                                   
    %include macros( 'norecds.sas'  ); /*** display no dups found notification ***/                       
                                                                                                          
    %let trnxs   = transact ; /***Transaction Data File - after standardization ***/                      
    %let master  = master   ; /***Master Data File - after standardization ***/                           
    %let dup     = within   ; /***if checking for duplicates and Master/Transact are same File ***/       
   ****%let dup     = external ; /*** if checking for matches and Master/Transact are different Files ***/
                                                                                                          
    %let ziparea = 3; /*** How much of a Zip Area to search duplicates ***/                               
                      /*** smaller # means larger area, larger # means smaller area ***/                  
                      /*** ranges from 1 to 5 ***/                                                        
                                                                                                          
    %let maxword = 20; /*** maximum number of words in a given sas variable ***/                          
                                                                                                        
    data merge.&trnxs; /*** Transaction File ***/                                                         
      set merge.&trnxs;                                                                                   
      recdno = _n_; /*** get record position - used as an identifier for future use ***/                  
    run;                                                                                                  
                                                                                                          
    data merge.&master; /*** Master File ***/                                                             
     set merge.&master;                                                                                   
     recdno = _n_;  /*** get record position - used as an identifier for future use ***/                  
    run;                                                                                                  
                                                                                                          
    %emptyyn( merge.&trnxs ); /*** how many transaction records ***/                                      
    %let notnxs = &numobs ; /*** set notnxs to number of trnxs records ***/                               
                                                                                                          
    proc datasets library=work; delete matches; run; /*** start with empty matches file ***/              
                                                                                                          
    %macro match;                                                                                         
      %do loopk = 1 %to ¬nxs ; /*** Loop thru Transaction File record by record ***/                   
       data _null_;                                                                                       
        recd = &loopk ;                                                                                   
        set merge.&trnxs point=recd; /*** get record by physical pointer ***/                             
        call symput( 'zipcode' , substr( zipcode, 1, &ziparea) ); /*** get partial of zipcode ***/        
        stop;                                                                                             
       run;                                                                                               
                                                                                                          
       /*** Get Master File Records where Zipcode is similar to Transaction Record ***/                   
       data srchfile(drop=name street city state zipcode recdno);                                         
        set merge.&master(where=(substr(zipcode,1,&ziparea)="&zipcode") );                                
        _recdno = recdno; /*** use as an identifier ***/                                                  
                       /*** re-assign variable names so as to be able to match words ***/                 
        _name   = name;                                                                                   
        _street = street;                                                                                 
        _city   = city;                                                                                   
        _state  = state;                                                                                  
        _zipcode= zipcode;                                                                                
       run;                                                                                               
                                                                                                          
       %emptyyn( work.srchfile );                                                                         
       %let nosrch = &numobs ;                                                                            
                                                                                                          
       data selrecds(drop=ss nn nss);                                                                     
        length word $ 200. ;                                                                              
        retain checkstr;                                                                                  
        recd = &loopk ;                                                                                   
        set merge.&trnxs point=recd;                                                                      
        ss      = 0;                                                                                      
             Do ss = 1 to &nosrch ;                                                                       
                checkstr = 0;                                                                             
                recdss = ss ;                                                                             
                set srchfile point=recdss;                                                                
                if "&dup" = 'within' and recdno = _recdno then goto skiprecd; /*** same record ***/       
                do nn = 1 to &maxword;                                                                    

                   /*** try to match part of name   ***/                                                  
                   noname  = 0;                                                                           
                   noname2 = 0;                                                                           
                   nostre  = 0;                                                                           
                   word = scan( name  , nn );                                                             
                   if word eq ' ' then goto skiprecd; /*** no more words to process ***/                  
                   if (word ne ' ')            and                                                        
                      (length(trim(word)) > 1) and                                                        
                      0 ne index( _name  , trim(word) ) then noname=1;                                    
                   /*** check to see if first or last name match ***/                                     
                     if nn in (1 2) and noname = 1 then do;                                               
                        if nn = 1 then word2nd = scan( name, nn+1);                                       
                        if nn = 2 then word2nd = scan( name, nn-1);                                       
                        if (word2nd ne ' ')      and                                                      
                           (length(trim(word2nd)) > 1) and                                                
                           0 ne index( _name  , trim(word2nd) ) then noname2=1;                           
                        if noname  = 1 or  noname2 = 1 then checkstr + 1;                                 
                     end;                                                                                 
                   if noname  = 1 then output;                                                            
                                                                                                          
                   noname = 0; /*** force vowel processing ***/                                           
                   /*** if no hit, squeeze vowels and try again ***/                                      
                   if noname = 0 then do;                                                                 
                     word = compress( word, 'AEIOUY');                                                    
                     if (word ne ' ')            and                                                      
                        (length(trim(word)) > 1) and                                                      
                        0 ne index( compress(_name, 'AEIOUY')  , trim(word) ) then noname=1;              
                                                                                                          
                     /*** Now use the SoundEx function as last resort ***/                                
                        if noname = 0 then do;                                                            
                          likeword = scan( name, nn );                                                    
                          word = trim(likeword) || '*';                                                   
                          do inn = 1 to &maxword; /*** use soundex function if vowels failed ***/         
                             likeness = scan( _name, inn);                                                
                             if likeness = ' ' then goto skipsnds;                                        
                            if likeword ne ' ' and likeness ne ' ' then do;                               
                               %sndslike( likeword, likeness );                                           
                            end;                                                                          
                          end;                                                                            
                         skipsnds:;                                                                       
                        end;                                                                              
                                                                                                          
                   /*** check to see if first or last name match ***/                                     
                     if nn in (1 2) and noname > 0 then do;                                               
                        word2nd = compress( word2nd, 'AEIOUY');                                           
                        if (word2nd ne ' ')      and                                                      
                           (length(trim(word2nd)) > 1) and                                                
                           0 ne index( compress(_name, 'AEIOUY') , trim(word2nd) ) then noname2=1;        
                        if noname  > 0 or  noname2 = 1 then checkstr + 1;                                 
                     end;                                                                                 
                     if noname  > 0 then output;                                                          
                   end;                                                                                   

                 if checkstr > 1 then do; /*** now check street address for possible words ***/           
                   do nss = 1 to &maxword;                                                                
                     nostre = 0;                                                                          
                     noname = 0;                                                                          
                     /*** try to match part of street ***/                                                
                     word = scan( street,  nss);                                                          
                     if word eq ' ' then goto skipstre;                                                   
                     if (word ne ' ')            and                                                      
                        (length(trim(word)) > 1) and                                                      
                        0 ne index( _street, trim(word) ) then nostre=1;                                  
                     if nostre  = 1 then output;                                                          
                                                                                                          
                     /*** if no hit, squeeze vowels and try again ***/                                    
                     if nostre = 0 then do;                                                               
                       word = compress( word, 'AEIOUY');                                                  
                       if (word ne ' ')            and                                                    
                          (length(trim(word)) > 1) and                                                    
                          0 ne index( compress(_street, 'AEIOUY')  , trim(word) ) then nostre=1;          
                       if nostre  = 1 then output;                                                        
                     end;                                                                                 
                     /*** do not bother with SoundEx function on street address ***/                      
                   end;                                                                                   
                 end;                                                                                     
                skipstre:;                                                                                
                end;                                                                                      
              skiprecd:;                                                                                  
             end;                                                                                         
        stop; ;                                                                                           
       run;                                                                                               
       proc append base = matches  data=selrecds; run; /*** add found words to matches ***/               
      %end;                                                                                               
    %mend match;                                                                                          
                                                                                                          
    %match;                                                                                               
                                                                                                          
    proc sort data=matches out=matches;                                                                   
     by recdno _recdno word;                                                                              
    run;                                                                                                  
                                                                                                          
    /***count number of words found for name and street and total words ***/                              
    /***create pct words found and rank records ***/                                                      
    data dups(drop=word i noname nostre checkstr noname2 word2nd likeword inn likeness                    
                          sdf lsdf sds lsds);                                                             
     length matchstr $ 200. ;                                                                             
     retain matchstr;                                                                                     
     set matches;                                                                                         
     by recdno _recdno word;                                                                              
     if first._recdno then matchstr = ' ';                                                                
     if first._recdno then do;                                                                            
        cntname  = 0;                                                                                     
        cntstre  = 0;                                                                                     

     end;                                                                                                 
     /*** build a character string of words used to find possible match ***/                              
     if first.word and word ne ' ' then matchstr = trim( matchstr ) || trim(word) || ',';                 
     cntname + noname ;                                                                                   
     cntstre + nostre ;                                                                                   
     if last._recdno then do; /*** keep only the last duplicate record of master record ***/              
        cntword = 0;                                                                                      
        do i = 1 to &maxword;                                                                             
           if ' ' ne scan( name  , i ) then cntword + 1;                                                  
           if ' ' ne scan( street, i ) then cntword + 1;                                                  
        end;                                                                                              
        pctword = sum(of cntname cntstre) / cntword ; /*** calculate pct words ***/                       
        matchstr = left( substr(right(matchstr),1,200));                                                  
        output;                                                                                           
     end;                                                                                                 
    run;                                                                                                  
                                                                                                          
    proc sort data=dups out=merge.dups;                                                                   
     by recdno descending pctword; /*** rank by descending PCTWORD ***/                                   
    run;                                                                                                  
                                                                                                          
    proc datasets library=work;                                                                           
     delete srchfile selrecds matches dups; /*** delete unwanted datasets ***/                            
    quit;                                                                                                 
                                                                                                          
    /*** Now create a simple report of records found as a possible match ***/                             
    proc printto file="c:\download\merge\dups.lis" new; run;                                              
    options ls=72 nodate center number pageno=1;                                                          
    title1 "Transactions Records that potentially could match Master Records";                            
    title2 "1st 3 lines are from Transaction File";                                                       
    title3 "2nd 3 lines are from Master File";                                                            
    title4 "Last Line are words found that determined possible match";                                    
    title5 "Words with * caught with soundex function";                                                   
                                                                                                          
    %emptyyn( merge.dups );                                                                               
    %norecds; /*** Display Report if No Duplicate Records Exist ***/                                      
                                                                                                          
    proc forms data=merge.dups width=120 lines=9 between=1 pagesize=60;                                   
    line 1   recdno  name; /*** Transaction Record ***/                                                   
    line 2            street / indent=13;                                                                 
    line 3            city  state  zipcode / pack indent=13;                                              
    line 4  _recdno  _name ; /*** Master Record ***/                                                      
    line 5           _street / indent=13;                                                                 
    line 6           _city _state _zipcode / pack indent=13;                                              
    line 7           matchstr ;                                                                           
    line 8 '________________________________';                                                            
    run;                                                                                                  
                                                                                                          
    proc printto; run;                                                                                    
    /*** end of program - fuzzy ***/                                                                      



    /*******************************************************************/                                 
                        /***     MEMBER = NORECDS.SAS   ***/ 

    %macro norecds;                                                    
         %if &numobs = 0 %then %do;                                    
          data _null_;                                                 
           file print;                                                 
           put "Found No Records that could potentially be duplicated";
           put "Hence, there are no records to display at this time";  
         run;                                                          
         %end;                                                         
    %mend norecds;                                                     


    /*******************************************************************/                                 
                        /***     MEMBER = READSTD.SAS   ***/ 


   /*** readstd.sas ***/                                         
   /*** read file with words that need to be eliminated ***/     
                                                                 
    %macro readstd ( fnames );                                   
         filename stdfiles "c:\download\merge\&fnames..csv"; run;
         data &fnames ;                                          
          length orginal $ 200. convert $ 200. ;                 
          infile stdfiles delimiter=',' dsd missover;            
          input  orginal convert ;                               
          orginal = upcase(orginal);                             
          convert = upcase(convert);                             
         run;                                                    
    %mend readstd;                                               
                                                                 
    /*** end of program - readstd ***/                           


                                                                                   
    /***************************************************************/              
    /*** MACRO REMOVEDB.SAS                                      ***/              
    /***                                                         ***/              
    /***  Parameter STRING is the character string you wish to   ***/              
    /***  have doubles removed from.                             ***/              
    /***************************************************************/              
                                                                                   
    %macro removedb( string );                                                     
     zzcnt = 0;                                                                    
     do zzw = 1 to 100;/*** how many words are there in string ***/                
        if ' ' ne scan( &string, zzw) then zzcnt + 1;                              
     end;                                                                          
     zzw = 0;                                                                      
     length zzwordo $ 200 zzword $ 200 ;                                           
     do zzw = 1 to zzcnt; /*** loop for as many words as found ***/                
      zzwordo = scan(&string,zzw); /*** keep original word ***/                    
      zzword  = zzwordo;           /*** new word to be modified ***/               
          do zzl = 1 to length(trim(left(zzword)))-1; /*** loop length of word ***/
           /*** set first occurrence of double letter to blank ***/                
           if substr(zzword ,zzl,1)=substr(zzword ,zzl+1,1) then                   
              substr(zzword ,zzl,1)=' ';                                           
          end;                                                                     
      zzword  = compress(zzword, ' '); /*** remove all internal blanks ***/        
                /*** replace original word with new word ***/                      
                /*** and remove multiple blanks ***/                               
      &string = compbl (tranwrd( &string,trim(zzwordo),trim(zzword)));             
     end;                                                                          
      drop zzl zzw zzcnt zzwordo zzword; /*** drop these vars ***/                 
    %mend removedb;                                                                



    /*******************************************************************/                                 
                      /***     MEMBER = SNDSLIKE.SAS     ***/ 


    %macro sndslike( first, second );                                  
         sdf= soundex(&first);                                         
        lsdf= substr(sdf,2);                                           
         sds= soundex(&second);                                        
        lsds= substr(sds,2);                                           
          if length(trim(lsdf)) > 1 and length(trim(lsds)) > 1 then do;
           if ( 0 ne index(trim(lsdf), trim(lsds))) or                 
              ( 0 ne index(trim(lsds), trim(lsdf))) then noname + 1;   
          end;                                                         
     %mend sndslike;                                                   


   /********************************************************************/                            
    /***stdlist.sas                                                  ***/                            
    /***                                                             ***/                            
    /***  Author: Charles Patridge                                   ***/                            
    /***          PDPC, Ltd.                                         ***/                            
    /***          172 Monce Road                                     ***/                            
    /***          Burlington, CT 06013                               ***/                            
    /***          Home: 860-673-9278 or 860-675-9026                 ***/                            
    /***          Email: TVJB41A@prodigy.com                         ***/                            
    /***          Email: Charles_S_Patridge@prodigy.com              ***/                            
    /***                                                             ***/                            
    /***  Copyrighted 1984, 1989, 1993, 1994, 1996, 1998             ***/                            
    /***  With Author's permission, you may use this program         ***/                            
    /***  as agreed upon by Author.  It is illegal to distribute     ***/                            
    /***  this program for profit or misrepresent the original       ***/                            
    /***  owner/author of this Fuzzy Merge Routine(s).               ***/                            
    /***                                                             ***/                            
    /***  In order to make Fuzzy Match Process Work, standardization ***/                            
    /***  Names and Street Addresses is the critical key to success  ***/                            
    /***  Names are converted to shorten versions where possible     ***/                            
    /***  Addresses are shorten and common words like road, street   ***/                            
    /***   drive, etc. are eliminated from address field to reduce   ***/                            
    /***   the number of possible false hits.                        ***/                            
    /***                                                             ***/                            
    /***  When dealing with Company Names, standardization is also   ***/                            
    /***  critical.  Common Words for Company Names need to be re-   ***/                            
    /***  moved; such COMPANY, LIMITED, ASSOCIATES, etc              ***/                            
    /***  In addition, word endings such as S, ES, IES, ING, TION    ***/                            
    /***  should be dropped using the DROPCHAR routine.              ***/                            
    /***  Also, it might be useful to remove any double letters found***/                            
    /***  within a company name and use REMOVEDB routine to do this  ***/                            
    /***                                                             ***/                            
    /***  Also need to remove special characters and make every      ***/                            
    /***  word UPPER CASE to get exact matching made easier          ***/                            
    /***                                                             ***/                            
    /***  Suggest making each field longer than necessary in order to***/                            
    /***  actually split a word into two words,                      ***/                            
    /***  like HONGKONG set to HONG KONG to increase matching hits   ***/                            
    /*******************************************************************/                            
     libname  merge     "c:\download\merge" ; run;                                                   
     filename macros    "c:\download\merge" ; run;                                                   
                                                                                                     
     %include macros( 'emptyyn.sas' );                                                               
     %include macros( 'dropchar.sas' );                                                              
     %include macros( 'removedb.sas' );                                                              
     %include macros( 'readstd.sas' );                                                               
                                                                                                     
    /*** Read File ***/                                                                              
   %macro stdlist( fname );                                                                          
     filename &fname    "c:\download\merge\&fname..csv"; run;                                        
     DATA &fname;                                                                                    
     length name $200 street $200 city $200 state $200 zipcode $5 ;                                  

     INFILE &fname missover delimiter=',' dsd ;                                                      
     input name street city state zipcode;                                                           
     name   = upcase(name  );                                                                        
     street = upcase(street);                                                                        
     city   = upcase(city  );                                                                        
     *** state  = upcase(state );                                                                    
     state = zipstate( zipcode ); /*** use zipcode to get State Abbreviation ***/                    
     run;                                                                                            
                                                                                                     
     /*** eliminate exact duplicate records of names using NODUPKEY ***/                             
     /*** no sense in processing matching process for 2 or more identical records ***/               
     PROC SORT DATA=&fname OUT=&fname NODUPKEY; by name street zipcode; run;                         
                                                                                                     
     data &fname(drop=i spechars);                                                                   
      set &fname;                                                                                    
     spechars = ',!@#$%^&*()_+-={}[]:;"|\<>.?/~`' || "'";                                            
                                                                                                     
     do i = 1 to length(spechars);                                                                   
        name    = translate(name   , ' ' , substr(spechars,i) ); /*** make spec chars blank ***/     
        street  = translate(street , ' ' , substr(spechars,i) ); /*** make spec chars blank ***/     
     end;                                                                                            
                                                                                                     
     name      = compbl( name      );  /*** remove multiple internal blanks ***/                     
     street    = compbl( street    );  /*** remove multiple internal blanks ***/                     
     run;                                                                                            
                                                                                                     
    %emptyyn( &fname ); /*** how many rawdata records ***/                                           
    %let notnxs = &numobs ; /*** set notnxs to number of records in file ***/                        
                                                                                                     
    %readstd( stdname ); /*** Read Standardized Name File ***/                                       
    %emptyyn( stdname ); /*** How many Standardized Names are there ***/                             
    %let noname = &numobs ; /*** set noname to number of standardized name records ***/              
                                                                                                     
    %readstd( stdaddr ); /*** Read Standardized Address File ***/                                    
    %emptyyn( stdaddr ); /*** How many Standardized Addresses are there ***/                         
    %let noaddr = &numobs ; /*** set noaddr to number of standardized address records ***/           
                                                                                                     
    /*** if dealing with company names, then add code to process standardized company names ***/     
    /*** use stdbusn file to do this ***/                                                            
                                                                                                     
    %macro stdized;                                                                                  
                     /*** orginal will be made into what is in convert ***/                          
       data merge.&fname(drop=in is orginal convert );                                               
        set &fname;                                                                                  
               in = 0;                                                                               
           %do in = 1 %to &noname;                                                                   
             recd = &in ;                                                                            
             set stdname point=recd;                                                                 
             name = ' ' || trim(name); /*** add a blank to front of word ***/                        
             name = left( tranwrd(name ,' ' || trim(orginal) || ' ' ,' '||trim(convert)||' ' ));     
             name = compress( name    , '_' ); /*** _ means removes word ***/                        
             name = compbl ( name );  /*** remove multiple adjacent blanks ***/                      
           %end;                                                                                     
                                                                                                     
               is = 0;                                                                               
           %do is = 1 %to &noaddr;                                                                   
             recd = &is ;                                                                            
             set stdaddr point=recd;                                                                 
             street= ' ' || trim(street); /*** add a blank to front of word ***/                     
             street= left( tranwrd(street ,' ' || trim(orginal) || ' ' ,' '||trim(convert)||' ' ));  
             street= compress( street    , '_' ); /*** _ means removes word ***/                     
             street= compbl( street ); /*** remove multiple adjacent blanks ***/                     
           %end;                                                                                     
                                                                                                     
         *** %removedb( name ); /*** remove double letters - may not want to do this***/             
                                                                                                     
         *** %dropchar( name, S   ); /*** drop the following characters from the ending of words ***/
                                                                                                     
       run;                                                                                          
                                                                                                     
    proc datasets library=work ; delete stdname stdaddr; quit; /*** delete unwanted datasets ***/    
    %mend stdized;                                                                                   
                                                                                                     
    %stdized; /*** standardize file ***/                                                             
                                                                                                     
   %mend stdlist;                                                                                    
                                                                                                     
                                                                                                     
   %stdlist( master );   /*** Standardize Master File ***/                                           
                                                                                                     
   %stdlist( transact ); /*** Standardize Transaction File ***/                                      



    /*******************************************************************/                                 
                        /***     MEMBER = TOKENIZE.SAS   ***/ 
                                                 
                                                                         
    /*** Tokenize.sas ***/                                               
    OPTIONS LS=80  CENTER DATE NUMBER pagesize=60;                       
    filename datafile "c:\download\merge\transact.csv"; run;             
                                                                         
   DATA TOKEN  (KEEP=TOKEN);                                             
    length string $ 200 name $ 200 street $ 200 cityst $ 200 zipcode $ 5;
    INFILE DATAFILE missover delimiter=';' dsd ;                         
    input  name street cityst zipcode ;                                  
    string = compress( upcase(name), '.');                               
                                                                         
    DO I = 1 TO 100;                                                     
      specchar = " ,<.>/?:;'\|[{]}!@#$%^&*()_-+=~`" || '"';              
      TOKEN = SCAN(STRING,I, ' '      );                                 
      IF TOKEN = ' ' THEN GOTO FINISH;                                   
      if length(trim(token)) < 2 then goto skip; /*** length < 2 ***/    
      OUTPUT TOKEN;                                                      
      skip: ;                                                            
    END;                                                                 
    FINISH: ;                                                            
   RUN;                                                                  
                                                                         
   PROC SORT DATA=TOKEN OUT=TOKEN; BY TOKEN; RUN;                        
                                                                         
   DATA TOKEN; SET TOKEN; BY TOKEN;                                      
    IF FIRST.TOKEN AND LAST.TOKEN THEN DELETE;                           
   RUN;                                                                  
                                                                         
   proc printto file = "tokenize.lis" new; run;                          
                                                                         
   PROC FREQ DATA=TOKEN;  ***ORDER=FREQ;                                 
   TABLES TOKEN / MISSING out=freq noprint;                              
   RUN;                                                                  
                                                                         
   proc print data=freq; run;                                            
                                                                         
   /*** end of program - tokenize ***/