/******************************************************************/
/* Appendix B - SAS Code files for Obswww15 - Patridge */
/* */
/* */
/* 'Source Listings of SAS Programs/Macros for Fuzzy Match' */
/* SOURCE LISTING */
/* FILE REF = PRTFUZZ AND FILE TYPES = SAS */
/* */
/* -------------------- FILEREF=PRTFUZZ ----------------------- */
/* */
/* OBS SRCCODE */
/* */
/* 1 DROPCHAR.SAS */
/* 2 EMPTYYN.SAS */
/* 3 FUZZY.SAS */
/* 4 NORECDS.SAS */
/* 5 READSTD.SAS */
/* 6 REMOVEDB.SAS */
/* 7 SNDSLIKE.SAS */
/* 8 STDLIST.SAS */
/* 9 TOKENIZE.SAS */
/******************************************************************/
/***************************************************************/
/*** MACRO DROPCHAR.SAS ***/
/*** ***/
/*** Delete a string of characters from the end of any word ***/
/***************************************************************/
%macro dropchar( string, char );
zzcnt = 0;
zzl = length( "&char" );
do zzw = 1 to 100;/*** how many words are there in string ***/
if ' ' ne scan( &string, zzw) then zzcnt + 1;
end;
zzw = 0;
length zzwordo $ 200 zzword $ 200 ;
do zzw = 1 to zzcnt; /*** loop for as many words as found ***/
zzwordo = scan(&string,zzw); /*** keep original word ***/
zzword = zzwordo; /*** new word to be modified ***/
/*** check the last char of the word and set to blank ***/
zzword = left(reverse(zzword));
if substr(zzword,1,zzl)=reverse("&char") then substr(zzword,1,zzl)=' ';
zzword = left(reverse(zzword));
zzword = compress(zzword, ' '); /*** remove all internal blanks ***/
/*** replace original word with new word ***/
/*** and remove multiple blanks ***/
&string = compbl (tranwrd( &string,trim(zzwordo),trim(zzword)));
end;
drop zzl zzw zzcnt zzwordo zzword; /*** drop these vars ***/
%mend dropchar;
*********************************************************************;
** PROGRAM: EMPTYYN (MACRO) ;
** AUTHOR: CHUCK PATRIDGE ;
** DATE: 10/12/94 ;
** PURPOSE: DETERMINE IF A SAS DATASET IS EMPTY. ;
** INPUT PARAMETER: DSNAME (NAME OF SAS DATA SET) ;
** ALSO CAN BE BLANK ( USE LAST DATASET CREATED) ;
** OUTPUT: WILL CREATE THE FOLLOWING GLOBAL MACRO VARIABLES ;
** EMPPTYYN Y=EMPTY, N=NONEMPTY ;
** NUMOBS WHICH PROVIDES NUMBER OF RECORDS ;
** DSN WHICH PROVIDES NAME OF DATA SET ;
** SAMPLE CALL: %EMPTYYN(DATASET) ;
** ;
*********************************************************************;
%GLOBAL EMPTYYN NUMOBS DSN;
%MACRO EMPTYYN(DSNAME);
DATA _NULL_;
IF "&DSNAME " = " " THEN CALL SYMPUT('DSNAME','_LAST_');
RUN;
DATA _NULL_;
IF 0 THEN SET &DSNAME NOBS=NUMOBS;
IF NUMOBS > 0 THEN EMPTYYN = 'N';
ELSE EMPTYYN = 'Y';
CALL SYMPUT('EMPTYYN',PUT(EMPTYYN, $1.));
CALL SYMPUT('NUMOBS' ,PUT(NUMOBS , BEST.));
CALL SYMPUT("DSN" ,PUT("&DSNAME" , $VARYING17.));
RUN;
%MEND EMPTYYN; RUN;
******** END OF Program - Emptyyn ***********;
/*******************************************************************/
/***fuzzy.sas ***/
/*** ***/
/*** Author: Charles Patridge ***/
/*** PDPC, Ltd. ***/
/*** 172 Monce Road ***/
/*** Burlington, CT 06013 ***/
/*** Home: 860-673-9278 or 860-675-9026 ***/
/*** Email: TVJB41A@prodigy.com ***/
/*** Email: Charles_S_Patridge@prodigy.com ***/
/*** ***/
/*** Copyrighted 1984, 1989, 1993, 1994, 1996, 1998 ***/
/*** With Author's permission, you may use this program ***/
/*** as agreed upon by Author. It is illegal to distribute ***/
/*** this program for profit or misrepresent the original ***/
/*** owner/author of this Fuzzy Merge Routine(s). ***/
/*** ***/
/*** Match potential records between two files , duplicates ***/
/*** After Files have been standardized Names, Addresses, State ***/
/*** Find number of words found in Name and Address fields ***/
/*** Find Pct of words over all words ***/
/*** ***/
/*** Ignore finding a record against itself (dup macro var) ***/
/*** Search Only records that have the same 1st "3" ***/
/*** characters of zipcode (USA zip codes) ***/
/*** and have same gender and birthdate ***/
/*** ***/
/*** To Determine Duplicates, First Name or Last Name ***/
/*** must be matched (order of appearance makes no ***/
/*** difference). Initials will throw off a match. ***/
/*** ***/
/*** This routines employs the use of the soundex function ***/
/*** to compensate for like sounding names. ***/
/*** ***/
/*******************************************************************/
Libname merge "C:\download\merge"; run;
filename macros "C:\download\merge"; run;
%include macros( 'emptyyn.sas' ); /*** Determine NOBS of a DATASET ***/
%include macros( 'sndslike.sas' ); /*** Soundex Function Macro ***/
%include macros( 'norecds.sas' ); /*** display no dups found notification ***/
%let trnxs = transact ; /***Transaction Data File - after standardization ***/
%let master = master ; /***Master Data File - after standardization ***/
%let dup = within ; /***if checking for duplicates and Master/Transact are same File ***/
****%let dup = external ; /*** if checking for matches and Master/Transact are different Files ***/
%let ziparea = 3; /*** How much of a Zip Area to search duplicates ***/
/*** smaller # means larger area, larger # means smaller area ***/
/*** ranges from 1 to 5 ***/
%let maxword = 20; /*** maximum number of words in a given sas variable ***/
data merge.&trnxs; /*** Transaction File ***/
set merge.&trnxs;
recdno = _n_; /*** get record position - used as an identifier for future use ***/
run;
data merge.&master; /*** Master File ***/
set merge.&master;
recdno = _n_; /*** get record position - used as an identifier for future use ***/
run;
%emptyyn( merge.&trnxs ); /*** how many transaction records ***/
%let notnxs = &numobs ; /*** set notnxs to number of trnxs records ***/
proc datasets library=work; delete matches; run; /*** start with empty matches file ***/
%macro match;
%do loopk = 1 %to ¬nxs ; /*** Loop thru Transaction File record by record ***/
data _null_;
recd = &loopk ;
set merge.&trnxs point=recd; /*** get record by physical pointer ***/
call symput( 'zipcode' , substr( zipcode, 1, &ziparea) ); /*** get partial of zipcode ***/
stop;
run;
/*** Get Master File Records where Zipcode is similar to Transaction Record ***/
data srchfile(drop=name street city state zipcode recdno);
set merge.&master(where=(substr(zipcode,1,&ziparea)="&zipcode") );
_recdno = recdno; /*** use as an identifier ***/
/*** re-assign variable names so as to be able to match words ***/
_name = name;
_street = street;
_city = city;
_state = state;
_zipcode= zipcode;
run;
%emptyyn( work.srchfile );
%let nosrch = &numobs ;
data selrecds(drop=ss nn nss);
length word $ 200. ;
retain checkstr;
recd = &loopk ;
set merge.&trnxs point=recd;
ss = 0;
Do ss = 1 to &nosrch ;
checkstr = 0;
recdss = ss ;
set srchfile point=recdss;
if "&dup" = 'within' and recdno = _recdno then goto skiprecd; /*** same record ***/
do nn = 1 to &maxword;
/*** try to match part of name ***/
noname = 0;
noname2 = 0;
nostre = 0;
word = scan( name , nn );
if word eq ' ' then goto skiprecd; /*** no more words to process ***/
if (word ne ' ') and
(length(trim(word)) > 1) and
0 ne index( _name , trim(word) ) then noname=1;
/*** check to see if first or last name match ***/
if nn in (1 2) and noname = 1 then do;
if nn = 1 then word2nd = scan( name, nn+1);
if nn = 2 then word2nd = scan( name, nn-1);
if (word2nd ne ' ') and
(length(trim(word2nd)) > 1) and
0 ne index( _name , trim(word2nd) ) then noname2=1;
if noname = 1 or noname2 = 1 then checkstr + 1;
end;
if noname = 1 then output;
noname = 0; /*** force vowel processing ***/
/*** if no hit, squeeze vowels and try again ***/
if noname = 0 then do;
word = compress( word, 'AEIOUY');
if (word ne ' ') and
(length(trim(word)) > 1) and
0 ne index( compress(_name, 'AEIOUY') , trim(word) ) then noname=1;
/*** Now use the SoundEx function as last resort ***/
if noname = 0 then do;
likeword = scan( name, nn );
word = trim(likeword) || '*';
do inn = 1 to &maxword; /*** use soundex function if vowels failed ***/
likeness = scan( _name, inn);
if likeness = ' ' then goto skipsnds;
if likeword ne ' ' and likeness ne ' ' then do;
%sndslike( likeword, likeness );
end;
end;
skipsnds:;
end;
/*** check to see if first or last name match ***/
if nn in (1 2) and noname > 0 then do;
word2nd = compress( word2nd, 'AEIOUY');
if (word2nd ne ' ') and
(length(trim(word2nd)) > 1) and
0 ne index( compress(_name, 'AEIOUY') , trim(word2nd) ) then noname2=1;
if noname > 0 or noname2 = 1 then checkstr + 1;
end;
if noname > 0 then output;
end;
if checkstr > 1 then do; /*** now check street address for possible words ***/
do nss = 1 to &maxword;
nostre = 0;
noname = 0;
/*** try to match part of street ***/
word = scan( street, nss);
if word eq ' ' then goto skipstre;
if (word ne ' ') and
(length(trim(word)) > 1) and
0 ne index( _street, trim(word) ) then nostre=1;
if nostre = 1 then output;
/*** if no hit, squeeze vowels and try again ***/
if nostre = 0 then do;
word = compress( word, 'AEIOUY');
if (word ne ' ') and
(length(trim(word)) > 1) and
0 ne index( compress(_street, 'AEIOUY') , trim(word) ) then nostre=1;
if nostre = 1 then output;
end;
/*** do not bother with SoundEx function on street address ***/
end;
end;
skipstre:;
end;
skiprecd:;
end;
stop; ;
run;
proc append base = matches data=selrecds; run; /*** add found words to matches ***/
%end;
%mend match;
%match;
proc sort data=matches out=matches;
by recdno _recdno word;
run;
/***count number of words found for name and street and total words ***/
/***create pct words found and rank records ***/
data dups(drop=word i noname nostre checkstr noname2 word2nd likeword inn likeness
sdf lsdf sds lsds);
length matchstr $ 200. ;
retain matchstr;
set matches;
by recdno _recdno word;
if first._recdno then matchstr = ' ';
if first._recdno then do;
cntname = 0;
cntstre = 0;
end;
/*** build a character string of words used to find possible match ***/
if first.word and word ne ' ' then matchstr = trim( matchstr ) || trim(word) || ',';
cntname + noname ;
cntstre + nostre ;
if last._recdno then do; /*** keep only the last duplicate record of master record ***/
cntword = 0;
do i = 1 to &maxword;
if ' ' ne scan( name , i ) then cntword + 1;
if ' ' ne scan( street, i ) then cntword + 1;
end;
pctword = sum(of cntname cntstre) / cntword ; /*** calculate pct words ***/
matchstr = left( substr(right(matchstr),1,200));
output;
end;
run;
proc sort data=dups out=merge.dups;
by recdno descending pctword; /*** rank by descending PCTWORD ***/
run;
proc datasets library=work;
delete srchfile selrecds matches dups; /*** delete unwanted datasets ***/
quit;
/*** Now create a simple report of records found as a possible match ***/
proc printto file="c:\download\merge\dups.lis" new; run;
options ls=72 nodate center number pageno=1;
title1 "Transactions Records that potentially could match Master Records";
title2 "1st 3 lines are from Transaction File";
title3 "2nd 3 lines are from Master File";
title4 "Last Line are words found that determined possible match";
title5 "Words with * caught with soundex function";
%emptyyn( merge.dups );
%norecds; /*** Display Report if No Duplicate Records Exist ***/
proc forms data=merge.dups width=120 lines=9 between=1 pagesize=60;
line 1 recdno name; /*** Transaction Record ***/
line 2 street / indent=13;
line 3 city state zipcode / pack indent=13;
line 4 _recdno _name ; /*** Master Record ***/
line 5 _street / indent=13;
line 6 _city _state _zipcode / pack indent=13;
line 7 matchstr ;
line 8 '________________________________';
run;
proc printto; run;
/*** end of program - fuzzy ***/
/*******************************************************************/
/*** MEMBER = NORECDS.SAS ***/
%macro norecds;
%if &numobs = 0 %then %do;
data _null_;
file print;
put "Found No Records that could potentially be duplicated";
put "Hence, there are no records to display at this time";
run;
%end;
%mend norecds;
/*******************************************************************/
/*** MEMBER = READSTD.SAS ***/
/*** readstd.sas ***/
/*** read file with words that need to be eliminated ***/
%macro readstd ( fnames );
filename stdfiles "c:\download\merge\&fnames..csv"; run;
data &fnames ;
length orginal $ 200. convert $ 200. ;
infile stdfiles delimiter=',' dsd missover;
input orginal convert ;
orginal = upcase(orginal);
convert = upcase(convert);
run;
%mend readstd;
/*** end of program - readstd ***/
/***************************************************************/
/*** MACRO REMOVEDB.SAS ***/
/*** ***/
/*** Parameter STRING is the character string you wish to ***/
/*** have doubles removed from. ***/
/***************************************************************/
%macro removedb( string );
zzcnt = 0;
do zzw = 1 to 100;/*** how many words are there in string ***/
if ' ' ne scan( &string, zzw) then zzcnt + 1;
end;
zzw = 0;
length zzwordo $ 200 zzword $ 200 ;
do zzw = 1 to zzcnt; /*** loop for as many words as found ***/
zzwordo = scan(&string,zzw); /*** keep original word ***/
zzword = zzwordo; /*** new word to be modified ***/
do zzl = 1 to length(trim(left(zzword)))-1; /*** loop length of word ***/
/*** set first occurrence of double letter to blank ***/
if substr(zzword ,zzl,1)=substr(zzword ,zzl+1,1) then
substr(zzword ,zzl,1)=' ';
end;
zzword = compress(zzword, ' '); /*** remove all internal blanks ***/
/*** replace original word with new word ***/
/*** and remove multiple blanks ***/
&string = compbl (tranwrd( &string,trim(zzwordo),trim(zzword)));
end;
drop zzl zzw zzcnt zzwordo zzword; /*** drop these vars ***/
%mend removedb;
/*******************************************************************/
/*** MEMBER = SNDSLIKE.SAS ***/
%macro sndslike( first, second );
sdf= soundex(&first);
lsdf= substr(sdf,2);
sds= soundex(&second);
lsds= substr(sds,2);
if length(trim(lsdf)) > 1 and length(trim(lsds)) > 1 then do;
if ( 0 ne index(trim(lsdf), trim(lsds))) or
( 0 ne index(trim(lsds), trim(lsdf))) then noname + 1;
end;
%mend sndslike;
/********************************************************************/
/***stdlist.sas ***/
/*** ***/
/*** Author: Charles Patridge ***/
/*** PDPC, Ltd. ***/
/*** 172 Monce Road ***/
/*** Burlington, CT 06013 ***/
/*** Home: 860-673-9278 or 860-675-9026 ***/
/*** Email: TVJB41A@prodigy.com ***/
/*** Email: Charles_S_Patridge@prodigy.com ***/
/*** ***/
/*** Copyrighted 1984, 1989, 1993, 1994, 1996, 1998 ***/
/*** With Author's permission, you may use this program ***/
/*** as agreed upon by Author. It is illegal to distribute ***/
/*** this program for profit or misrepresent the original ***/
/*** owner/author of this Fuzzy Merge Routine(s). ***/
/*** ***/
/*** In order to make Fuzzy Match Process Work, standardization ***/
/*** Names and Street Addresses is the critical key to success ***/
/*** Names are converted to shorten versions where possible ***/
/*** Addresses are shorten and common words like road, street ***/
/*** drive, etc. are eliminated from address field to reduce ***/
/*** the number of possible false hits. ***/
/*** ***/
/*** When dealing with Company Names, standardization is also ***/
/*** critical. Common Words for Company Names need to be re- ***/
/*** moved; such COMPANY, LIMITED, ASSOCIATES, etc ***/
/*** In addition, word endings such as S, ES, IES, ING, TION ***/
/*** should be dropped using the DROPCHAR routine. ***/
/*** Also, it might be useful to remove any double letters found***/
/*** within a company name and use REMOVEDB routine to do this ***/
/*** ***/
/*** Also need to remove special characters and make every ***/
/*** word UPPER CASE to get exact matching made easier ***/
/*** ***/
/*** Suggest making each field longer than necessary in order to***/
/*** actually split a word into two words, ***/
/*** like HONGKONG set to HONG KONG to increase matching hits ***/
/*******************************************************************/
libname merge "c:\download\merge" ; run;
filename macros "c:\download\merge" ; run;
%include macros( 'emptyyn.sas' );
%include macros( 'dropchar.sas' );
%include macros( 'removedb.sas' );
%include macros( 'readstd.sas' );
/*** Read File ***/
%macro stdlist( fname );
filename &fname "c:\download\merge\&fname..csv"; run;
DATA &fname;
length name $200 street $200 city $200 state $200 zipcode $5 ;
INFILE &fname missover delimiter=',' dsd ;
input name street city state zipcode;
name = upcase(name );
street = upcase(street);
city = upcase(city );
*** state = upcase(state );
state = zipstate( zipcode ); /*** use zipcode to get State Abbreviation ***/
run;
/*** eliminate exact duplicate records of names using NODUPKEY ***/
/*** no sense in processing matching process for 2 or more identical records ***/
PROC SORT DATA=&fname OUT=&fname NODUPKEY; by name street zipcode; run;
data &fname(drop=i spechars);
set &fname;
spechars = ',!@#$%^&*()_+-={}[]:;"|\<>.?/~`' || "'";
do i = 1 to length(spechars);
name = translate(name , ' ' , substr(spechars,i) ); /*** make spec chars blank ***/
street = translate(street , ' ' , substr(spechars,i) ); /*** make spec chars blank ***/
end;
name = compbl( name ); /*** remove multiple internal blanks ***/
street = compbl( street ); /*** remove multiple internal blanks ***/
run;
%emptyyn( &fname ); /*** how many rawdata records ***/
%let notnxs = &numobs ; /*** set notnxs to number of records in file ***/
%readstd( stdname ); /*** Read Standardized Name File ***/
%emptyyn( stdname ); /*** How many Standardized Names are there ***/
%let noname = &numobs ; /*** set noname to number of standardized name records ***/
%readstd( stdaddr ); /*** Read Standardized Address File ***/
%emptyyn( stdaddr ); /*** How many Standardized Addresses are there ***/
%let noaddr = &numobs ; /*** set noaddr to number of standardized address records ***/
/*** if dealing with company names, then add code to process standardized company names ***/
/*** use stdbusn file to do this ***/
%macro stdized;
/*** orginal will be made into what is in convert ***/
data merge.&fname(drop=in is orginal convert );
set &fname;
in = 0;
%do in = 1 %to &noname;
recd = &in ;
set stdname point=recd;
name = ' ' || trim(name); /*** add a blank to front of word ***/
name = left( tranwrd(name ,' ' || trim(orginal) || ' ' ,' '||trim(convert)||' ' ));
name = compress( name , '_' ); /*** _ means removes word ***/
name = compbl ( name ); /*** remove multiple adjacent blanks ***/
%end;
is = 0;
%do is = 1 %to &noaddr;
recd = &is ;
set stdaddr point=recd;
street= ' ' || trim(street); /*** add a blank to front of word ***/
street= left( tranwrd(street ,' ' || trim(orginal) || ' ' ,' '||trim(convert)||' ' ));
street= compress( street , '_' ); /*** _ means removes word ***/
street= compbl( street ); /*** remove multiple adjacent blanks ***/
%end;
*** %removedb( name ); /*** remove double letters - may not want to do this***/
*** %dropchar( name, S ); /*** drop the following characters from the ending of words ***/
run;
proc datasets library=work ; delete stdname stdaddr; quit; /*** delete unwanted datasets ***/
%mend stdized;
%stdized; /*** standardize file ***/
%mend stdlist;
%stdlist( master ); /*** Standardize Master File ***/
%stdlist( transact ); /*** Standardize Transaction File ***/
/*******************************************************************/
/*** MEMBER = TOKENIZE.SAS ***/
/*** Tokenize.sas ***/
OPTIONS LS=80 CENTER DATE NUMBER pagesize=60;
filename datafile "c:\download\merge\transact.csv"; run;
DATA TOKEN (KEEP=TOKEN);
length string $ 200 name $ 200 street $ 200 cityst $ 200 zipcode $ 5;
INFILE DATAFILE missover delimiter=';' dsd ;
input name street cityst zipcode ;
string = compress( upcase(name), '.');
DO I = 1 TO 100;
specchar = " ,<.>/?:;'\|[{]}!@#$%^&*()_-+=~`" || '"';
TOKEN = SCAN(STRING,I, ' ' );
IF TOKEN = ' ' THEN GOTO FINISH;
if length(trim(token)) < 2 then goto skip; /*** length < 2 ***/
OUTPUT TOKEN;
skip: ;
END;
FINISH: ;
RUN;
PROC SORT DATA=TOKEN OUT=TOKEN; BY TOKEN; RUN;
DATA TOKEN; SET TOKEN; BY TOKEN;
IF FIRST.TOKEN AND LAST.TOKEN THEN DELETE;
RUN;
proc printto file = "tokenize.lis" new; run;
PROC FREQ DATA=TOKEN; ***ORDER=FREQ;
TABLES TOKEN / MISSING out=freq noprint;
RUN;
proc print data=freq; run;
/*** end of program - tokenize ***/