/******************************************************************/ /* Appendix B - SAS Code files for Obswww15 - Patridge */ /* */ /* */ /* 'Source Listings of SAS Programs/Macros for Fuzzy Match' */ /* SOURCE LISTING */ /* FILE REF = PRTFUZZ AND FILE TYPES = SAS */ /* */ /* -------------------- FILEREF=PRTFUZZ ----------------------- */ /* */ /* OBS SRCCODE */ /* */ /* 1 DROPCHAR.SAS */ /* 2 EMPTYYN.SAS */ /* 3 FUZZY.SAS */ /* 4 NORECDS.SAS */ /* 5 READSTD.SAS */ /* 6 REMOVEDB.SAS */ /* 7 SNDSLIKE.SAS */ /* 8 STDLIST.SAS */ /* 9 TOKENIZE.SAS */ /******************************************************************/ /***************************************************************/ /*** MACRO DROPCHAR.SAS ***/ /*** ***/ /*** Delete a string of characters from the end of any word ***/ /***************************************************************/ %macro dropchar( string, char ); zzcnt = 0; zzl = length( "&char" ); do zzw = 1 to 100;/*** how many words are there in string ***/ if ' ' ne scan( &string, zzw) then zzcnt + 1; end; zzw = 0; length zzwordo $ 200 zzword $ 200 ; do zzw = 1 to zzcnt; /*** loop for as many words as found ***/ zzwordo = scan(&string,zzw); /*** keep original word ***/ zzword = zzwordo; /*** new word to be modified ***/ /*** check the last char of the word and set to blank ***/ zzword = left(reverse(zzword)); if substr(zzword,1,zzl)=reverse("&char") then substr(zzword,1,zzl)=' '; zzword = left(reverse(zzword)); zzword = compress(zzword, ' '); /*** remove all internal blanks ***/ /*** replace original word with new word ***/ /*** and remove multiple blanks ***/ &string = compbl (tranwrd( &string,trim(zzwordo),trim(zzword))); end; drop zzl zzw zzcnt zzwordo zzword; /*** drop these vars ***/ %mend dropchar; *********************************************************************; ** PROGRAM: EMPTYYN (MACRO) ; ** AUTHOR: CHUCK PATRIDGE ; ** DATE: 10/12/94 ; ** PURPOSE: DETERMINE IF A SAS DATASET IS EMPTY. ; ** INPUT PARAMETER: DSNAME (NAME OF SAS DATA SET) ; ** ALSO CAN BE BLANK ( USE LAST DATASET CREATED) ; ** OUTPUT: WILL CREATE THE FOLLOWING GLOBAL MACRO VARIABLES ; ** EMPPTYYN Y=EMPTY, N=NONEMPTY ; ** NUMOBS WHICH PROVIDES NUMBER OF RECORDS ; ** DSN WHICH PROVIDES NAME OF DATA SET ; ** SAMPLE CALL: %EMPTYYN(DATASET) ; ** ; *********************************************************************; %GLOBAL EMPTYYN NUMOBS DSN; %MACRO EMPTYYN(DSNAME); DATA _NULL_; IF "&DSNAME " = " " THEN CALL SYMPUT('DSNAME','_LAST_'); RUN; DATA _NULL_; IF 0 THEN SET &DSNAME NOBS=NUMOBS; IF NUMOBS > 0 THEN EMPTYYN = 'N'; ELSE EMPTYYN = 'Y'; CALL SYMPUT('EMPTYYN',PUT(EMPTYYN, $1.)); CALL SYMPUT('NUMOBS' ,PUT(NUMOBS , BEST.)); CALL SYMPUT("DSN" ,PUT("&DSNAME" , $VARYING17.)); RUN; %MEND EMPTYYN; RUN; ******** END OF Program - Emptyyn ***********; /*******************************************************************/ /***fuzzy.sas ***/ /*** ***/ /*** Author: Charles Patridge ***/ /*** PDPC, Ltd. ***/ /*** 172 Monce Road ***/ /*** Burlington, CT 06013 ***/ /*** Home: 860-673-9278 or 860-675-9026 ***/ /*** Email: TVJB41A@prodigy.com ***/ /*** Email: Charles_S_Patridge@prodigy.com ***/ /*** ***/ /*** Copyrighted 1984, 1989, 1993, 1994, 1996, 1998 ***/ /*** With Author's permission, you may use this program ***/ /*** as agreed upon by Author. It is illegal to distribute ***/ /*** this program for profit or misrepresent the original ***/ /*** owner/author of this Fuzzy Merge Routine(s). ***/ /*** ***/ /*** Match potential records between two files , duplicates ***/ /*** After Files have been standardized Names, Addresses, State ***/ /*** Find number of words found in Name and Address fields ***/ /*** Find Pct of words over all words ***/ /*** ***/ /*** Ignore finding a record against itself (dup macro var) ***/ /*** Search Only records that have the same 1st "3" ***/ /*** characters of zipcode (USA zip codes) ***/ /*** and have same gender and birthdate ***/ /*** ***/ /*** To Determine Duplicates, First Name or Last Name ***/ /*** must be matched (order of appearance makes no ***/ /*** difference). Initials will throw off a match. ***/ /*** ***/ /*** This routines employs the use of the soundex function ***/ /*** to compensate for like sounding names. ***/ /*** ***/ /*******************************************************************/ Libname merge "C:\download\merge"; run; filename macros "C:\download\merge"; run; %include macros( 'emptyyn.sas' ); /*** Determine NOBS of a DATASET ***/ %include macros( 'sndslike.sas' ); /*** Soundex Function Macro ***/ %include macros( 'norecds.sas' ); /*** display no dups found notification ***/ %let trnxs = transact ; /***Transaction Data File - after standardization ***/ %let master = master ; /***Master Data File - after standardization ***/ %let dup = within ; /***if checking for duplicates and Master/Transact are same File ***/ ****%let dup = external ; /*** if checking for matches and Master/Transact are different Files ***/ %let ziparea = 3; /*** How much of a Zip Area to search duplicates ***/ /*** smaller # means larger area, larger # means smaller area ***/ /*** ranges from 1 to 5 ***/ %let maxword = 20; /*** maximum number of words in a given sas variable ***/ data merge.&trnxs; /*** Transaction File ***/ set merge.&trnxs; recdno = _n_; /*** get record position - used as an identifier for future use ***/ run; data merge.&master; /*** Master File ***/ set merge.&master; recdno = _n_; /*** get record position - used as an identifier for future use ***/ run; %emptyyn( merge.&trnxs ); /*** how many transaction records ***/ %let notnxs = &numobs ; /*** set notnxs to number of trnxs records ***/ proc datasets library=work; delete matches; run; /*** start with empty matches file ***/ %macro match; %do loopk = 1 %to ¬nxs ; /*** Loop thru Transaction File record by record ***/ data _null_; recd = &loopk ; set merge.&trnxs point=recd; /*** get record by physical pointer ***/ call symput( 'zipcode' , substr( zipcode, 1, &ziparea) ); /*** get partial of zipcode ***/ stop; run; /*** Get Master File Records where Zipcode is similar to Transaction Record ***/ data srchfile(drop=name street city state zipcode recdno); set merge.&master(where=(substr(zipcode,1,&ziparea)="&zipcode") ); _recdno = recdno; /*** use as an identifier ***/ /*** re-assign variable names so as to be able to match words ***/ _name = name; _street = street; _city = city; _state = state; _zipcode= zipcode; run; %emptyyn( work.srchfile ); %let nosrch = &numobs ; data selrecds(drop=ss nn nss); length word $ 200. ; retain checkstr; recd = &loopk ; set merge.&trnxs point=recd; ss = 0; Do ss = 1 to &nosrch ; checkstr = 0; recdss = ss ; set srchfile point=recdss; if "&dup" = 'within' and recdno = _recdno then goto skiprecd; /*** same record ***/ do nn = 1 to &maxword; /*** try to match part of name ***/ noname = 0; noname2 = 0; nostre = 0; word = scan( name , nn ); if word eq ' ' then goto skiprecd; /*** no more words to process ***/ if (word ne ' ') and (length(trim(word)) > 1) and 0 ne index( _name , trim(word) ) then noname=1; /*** check to see if first or last name match ***/ if nn in (1 2) and noname = 1 then do; if nn = 1 then word2nd = scan( name, nn+1); if nn = 2 then word2nd = scan( name, nn-1); if (word2nd ne ' ') and (length(trim(word2nd)) > 1) and 0 ne index( _name , trim(word2nd) ) then noname2=1; if noname = 1 or noname2 = 1 then checkstr + 1; end; if noname = 1 then output; noname = 0; /*** force vowel processing ***/ /*** if no hit, squeeze vowels and try again ***/ if noname = 0 then do; word = compress( word, 'AEIOUY'); if (word ne ' ') and (length(trim(word)) > 1) and 0 ne index( compress(_name, 'AEIOUY') , trim(word) ) then noname=1; /*** Now use the SoundEx function as last resort ***/ if noname = 0 then do; likeword = scan( name, nn ); word = trim(likeword) || '*'; do inn = 1 to &maxword; /*** use soundex function if vowels failed ***/ likeness = scan( _name, inn); if likeness = ' ' then goto skipsnds; if likeword ne ' ' and likeness ne ' ' then do; %sndslike( likeword, likeness ); end; end; skipsnds:; end; /*** check to see if first or last name match ***/ if nn in (1 2) and noname > 0 then do; word2nd = compress( word2nd, 'AEIOUY'); if (word2nd ne ' ') and (length(trim(word2nd)) > 1) and 0 ne index( compress(_name, 'AEIOUY') , trim(word2nd) ) then noname2=1; if noname > 0 or noname2 = 1 then checkstr + 1; end; if noname > 0 then output; end; if checkstr > 1 then do; /*** now check street address for possible words ***/ do nss = 1 to &maxword; nostre = 0; noname = 0; /*** try to match part of street ***/ word = scan( street, nss); if word eq ' ' then goto skipstre; if (word ne ' ') and (length(trim(word)) > 1) and 0 ne index( _street, trim(word) ) then nostre=1; if nostre = 1 then output; /*** if no hit, squeeze vowels and try again ***/ if nostre = 0 then do; word = compress( word, 'AEIOUY'); if (word ne ' ') and (length(trim(word)) > 1) and 0 ne index( compress(_street, 'AEIOUY') , trim(word) ) then nostre=1; if nostre = 1 then output; end; /*** do not bother with SoundEx function on street address ***/ end; end; skipstre:; end; skiprecd:; end; stop; ; run; proc append base = matches data=selrecds; run; /*** add found words to matches ***/ %end; %mend match; %match; proc sort data=matches out=matches; by recdno _recdno word; run; /***count number of words found for name and street and total words ***/ /***create pct words found and rank records ***/ data dups(drop=word i noname nostre checkstr noname2 word2nd likeword inn likeness sdf lsdf sds lsds); length matchstr $ 200. ; retain matchstr; set matches; by recdno _recdno word; if first._recdno then matchstr = ' '; if first._recdno then do; cntname = 0; cntstre = 0; end; /*** build a character string of words used to find possible match ***/ if first.word and word ne ' ' then matchstr = trim( matchstr ) || trim(word) || ','; cntname + noname ; cntstre + nostre ; if last._recdno then do; /*** keep only the last duplicate record of master record ***/ cntword = 0; do i = 1 to &maxword; if ' ' ne scan( name , i ) then cntword + 1; if ' ' ne scan( street, i ) then cntword + 1; end; pctword = sum(of cntname cntstre) / cntword ; /*** calculate pct words ***/ matchstr = left( substr(right(matchstr),1,200)); output; end; run; proc sort data=dups out=merge.dups; by recdno descending pctword; /*** rank by descending PCTWORD ***/ run; proc datasets library=work; delete srchfile selrecds matches dups; /*** delete unwanted datasets ***/ quit; /*** Now create a simple report of records found as a possible match ***/ proc printto file="c:\download\merge\dups.lis" new; run; options ls=72 nodate center number pageno=1; title1 "Transactions Records that potentially could match Master Records"; title2 "1st 3 lines are from Transaction File"; title3 "2nd 3 lines are from Master File"; title4 "Last Line are words found that determined possible match"; title5 "Words with * caught with soundex function"; %emptyyn( merge.dups ); %norecds; /*** Display Report if No Duplicate Records Exist ***/ proc forms data=merge.dups width=120 lines=9 between=1 pagesize=60; line 1 recdno name; /*** Transaction Record ***/ line 2 street / indent=13; line 3 city state zipcode / pack indent=13; line 4 _recdno _name ; /*** Master Record ***/ line 5 _street / indent=13; line 6 _city _state _zipcode / pack indent=13; line 7 matchstr ; line 8 '________________________________'; run; proc printto; run; /*** end of program - fuzzy ***/ /*******************************************************************/ /*** MEMBER = NORECDS.SAS ***/ %macro norecds; %if &numobs = 0 %then %do; data _null_; file print; put "Found No Records that could potentially be duplicated"; put "Hence, there are no records to display at this time"; run; %end; %mend norecds; /*******************************************************************/ /*** MEMBER = READSTD.SAS ***/ /*** readstd.sas ***/ /*** read file with words that need to be eliminated ***/ %macro readstd ( fnames ); filename stdfiles "c:\download\merge\&fnames..csv"; run; data &fnames ; length orginal $ 200. convert $ 200. ; infile stdfiles delimiter=',' dsd missover; input orginal convert ; orginal = upcase(orginal); convert = upcase(convert); run; %mend readstd; /*** end of program - readstd ***/ /***************************************************************/ /*** MACRO REMOVEDB.SAS ***/ /*** ***/ /*** Parameter STRING is the character string you wish to ***/ /*** have doubles removed from. ***/ /***************************************************************/ %macro removedb( string ); zzcnt = 0; do zzw = 1 to 100;/*** how many words are there in string ***/ if ' ' ne scan( &string, zzw) then zzcnt + 1; end; zzw = 0; length zzwordo $ 200 zzword $ 200 ; do zzw = 1 to zzcnt; /*** loop for as many words as found ***/ zzwordo = scan(&string,zzw); /*** keep original word ***/ zzword = zzwordo; /*** new word to be modified ***/ do zzl = 1 to length(trim(left(zzword)))-1; /*** loop length of word ***/ /*** set first occurrence of double letter to blank ***/ if substr(zzword ,zzl,1)=substr(zzword ,zzl+1,1) then substr(zzword ,zzl,1)=' '; end; zzword = compress(zzword, ' '); /*** remove all internal blanks ***/ /*** replace original word with new word ***/ /*** and remove multiple blanks ***/ &string = compbl (tranwrd( &string,trim(zzwordo),trim(zzword))); end; drop zzl zzw zzcnt zzwordo zzword; /*** drop these vars ***/ %mend removedb; /*******************************************************************/ /*** MEMBER = SNDSLIKE.SAS ***/ %macro sndslike( first, second ); sdf= soundex(&first); lsdf= substr(sdf,2); sds= soundex(&second); lsds= substr(sds,2); if length(trim(lsdf)) > 1 and length(trim(lsds)) > 1 then do; if ( 0 ne index(trim(lsdf), trim(lsds))) or ( 0 ne index(trim(lsds), trim(lsdf))) then noname + 1; end; %mend sndslike; /********************************************************************/ /***stdlist.sas ***/ /*** ***/ /*** Author: Charles Patridge ***/ /*** PDPC, Ltd. ***/ /*** 172 Monce Road ***/ /*** Burlington, CT 06013 ***/ /*** Home: 860-673-9278 or 860-675-9026 ***/ /*** Email: TVJB41A@prodigy.com ***/ /*** Email: Charles_S_Patridge@prodigy.com ***/ /*** ***/ /*** Copyrighted 1984, 1989, 1993, 1994, 1996, 1998 ***/ /*** With Author's permission, you may use this program ***/ /*** as agreed upon by Author. It is illegal to distribute ***/ /*** this program for profit or misrepresent the original ***/ /*** owner/author of this Fuzzy Merge Routine(s). ***/ /*** ***/ /*** In order to make Fuzzy Match Process Work, standardization ***/ /*** Names and Street Addresses is the critical key to success ***/ /*** Names are converted to shorten versions where possible ***/ /*** Addresses are shorten and common words like road, street ***/ /*** drive, etc. are eliminated from address field to reduce ***/ /*** the number of possible false hits. ***/ /*** ***/ /*** When dealing with Company Names, standardization is also ***/ /*** critical. Common Words for Company Names need to be re- ***/ /*** moved; such COMPANY, LIMITED, ASSOCIATES, etc ***/ /*** In addition, word endings such as S, ES, IES, ING, TION ***/ /*** should be dropped using the DROPCHAR routine. ***/ /*** Also, it might be useful to remove any double letters found***/ /*** within a company name and use REMOVEDB routine to do this ***/ /*** ***/ /*** Also need to remove special characters and make every ***/ /*** word UPPER CASE to get exact matching made easier ***/ /*** ***/ /*** Suggest making each field longer than necessary in order to***/ /*** actually split a word into two words, ***/ /*** like HONGKONG set to HONG KONG to increase matching hits ***/ /*******************************************************************/ libname merge "c:\download\merge" ; run; filename macros "c:\download\merge" ; run; %include macros( 'emptyyn.sas' ); %include macros( 'dropchar.sas' ); %include macros( 'removedb.sas' ); %include macros( 'readstd.sas' ); /*** Read File ***/ %macro stdlist( fname ); filename &fname "c:\download\merge\&fname..csv"; run; DATA &fname; length name $200 street $200 city $200 state $200 zipcode $5 ; INFILE &fname missover delimiter=',' dsd ; input name street city state zipcode; name = upcase(name ); street = upcase(street); city = upcase(city ); *** state = upcase(state ); state = zipstate( zipcode ); /*** use zipcode to get State Abbreviation ***/ run; /*** eliminate exact duplicate records of names using NODUPKEY ***/ /*** no sense in processing matching process for 2 or more identical records ***/ PROC SORT DATA=&fname OUT=&fname NODUPKEY; by name street zipcode; run; data &fname(drop=i spechars); set &fname; spechars = ',!@#$%^&*()_+-={}[]:;"|\<>.?/~`' || "'"; do i = 1 to length(spechars); name = translate(name , ' ' , substr(spechars,i) ); /*** make spec chars blank ***/ street = translate(street , ' ' , substr(spechars,i) ); /*** make spec chars blank ***/ end; name = compbl( name ); /*** remove multiple internal blanks ***/ street = compbl( street ); /*** remove multiple internal blanks ***/ run; %emptyyn( &fname ); /*** how many rawdata records ***/ %let notnxs = &numobs ; /*** set notnxs to number of records in file ***/ %readstd( stdname ); /*** Read Standardized Name File ***/ %emptyyn( stdname ); /*** How many Standardized Names are there ***/ %let noname = &numobs ; /*** set noname to number of standardized name records ***/ %readstd( stdaddr ); /*** Read Standardized Address File ***/ %emptyyn( stdaddr ); /*** How many Standardized Addresses are there ***/ %let noaddr = &numobs ; /*** set noaddr to number of standardized address records ***/ /*** if dealing with company names, then add code to process standardized company names ***/ /*** use stdbusn file to do this ***/ %macro stdized; /*** orginal will be made into what is in convert ***/ data merge.&fname(drop=in is orginal convert ); set &fname; in = 0; %do in = 1 %to &noname; recd = &in ; set stdname point=recd; name = ' ' || trim(name); /*** add a blank to front of word ***/ name = left( tranwrd(name ,' ' || trim(orginal) || ' ' ,' '||trim(convert)||' ' )); name = compress( name , '_' ); /*** _ means removes word ***/ name = compbl ( name ); /*** remove multiple adjacent blanks ***/ %end; is = 0; %do is = 1 %to &noaddr; recd = &is ; set stdaddr point=recd; street= ' ' || trim(street); /*** add a blank to front of word ***/ street= left( tranwrd(street ,' ' || trim(orginal) || ' ' ,' '||trim(convert)||' ' )); street= compress( street , '_' ); /*** _ means removes word ***/ street= compbl( street ); /*** remove multiple adjacent blanks ***/ %end; *** %removedb( name ); /*** remove double letters - may not want to do this***/ *** %dropchar( name, S ); /*** drop the following characters from the ending of words ***/ run; proc datasets library=work ; delete stdname stdaddr; quit; /*** delete unwanted datasets ***/ %mend stdized; %stdized; /*** standardize file ***/ %mend stdlist; %stdlist( master ); /*** Standardize Master File ***/ %stdlist( transact ); /*** Standardize Transaction File ***/ /*******************************************************************/ /*** MEMBER = TOKENIZE.SAS ***/ /*** Tokenize.sas ***/ OPTIONS LS=80 CENTER DATE NUMBER pagesize=60; filename datafile "c:\download\merge\transact.csv"; run; DATA TOKEN (KEEP=TOKEN); length string $ 200 name $ 200 street $ 200 cityst $ 200 zipcode $ 5; INFILE DATAFILE missover delimiter=';' dsd ; input name street cityst zipcode ; string = compress( upcase(name), '.'); DO I = 1 TO 100; specchar = " ,<.>/?:;'\|[{]}!@#$%^&*()_-+=~`" || '"'; TOKEN = SCAN(STRING,I, ' ' ); IF TOKEN = ' ' THEN GOTO FINISH; if length(trim(token)) < 2 then goto skip; /*** length < 2 ***/ OUTPUT TOKEN; skip: ; END; FINISH: ; RUN; PROC SORT DATA=TOKEN OUT=TOKEN; BY TOKEN; RUN; DATA TOKEN; SET TOKEN; BY TOKEN; IF FIRST.TOKEN AND LAST.TOKEN THEN DELETE; RUN; proc printto file = "tokenize.lis" new; run; PROC FREQ DATA=TOKEN; ***ORDER=FREQ; TABLES TOKEN / MISSING out=freq noprint; RUN; proc print data=freq; run; /*** end of program - tokenize ***/