*Dendro macro - Version 3 - January 21, 1996 ; * Enhancements include: Landscape format; * Array names changed to allow up to 9999 cases; options nosource2; %macro checkn; /* checks if node is a terminal node */ node=c(i,j); if node>0 then nodestat=1; else nodestat=stat(-node); %mend; %macro findnode; /* traverses the tree until a terminal */ /* node is located */ do while (nodestat=0); prev(-c(i,j))=j; j=-c(i,j); i=1; %checkn end; %mend; %macro fuse; /* increments the count of the number of */ /* fusions, records the order and status */ /* of the node and resets starting point */ /* for the next search */ do; fusions=fusions+1; ordr(j)=fusions; stat(j)=1; j=prev(j); end; %mend; %macro inverty; /* reverses y axis values for plotting */ %if "&order"="reverse" %then %do; y=ymax-y; %end; %mend; %macro genlines; /* generates annotate data set instructions */ /* to draw one fusion in the dendrogram */ if i=1 then do; y=ystart; %inverty; function='move'; output dendro ; y=ht; %inverty; function='draw'; output dendro; end; else do; y=ht; %inverty; function='draw'; output dendro; y=ystart; %inverty; function='draw'; output dendro; end; xcoord(i)=x; %mend; proc format; /* used to determine suitable intervals on y-axis */ value rft 1-<2='1' 2-<5='2' 5-<10='5' 10='10'; run; %macro dendro(format,xlabel,ylabel,labels=down,color=black, axis=black,back=white,text=black); %global nc np proc order ymax ymin inc diff; /* */ /* Purpose: Plots a dendrogram using the OUTTREE data */ /* set created by PROC CLUSTER */ /* */ /* Author: Paul Nicholson, University of Leeds */ /* */ /* Copyright: Paul Nicholson, University of Leeds */ /* */ /* Conditions of use: */ /* */ /* This macro may be used for educational and research */ /* purposes only and may not be used for commercial */ /* gain. */ /* */ /* Usage: */ /* */ /* (i) Run PROC CLUSTER or PROC VARCLUS specifying */ /* OUTTREE=TREE to save the cluster schedule */ /* in a data set called TREE. */ /* (ii) %INCLUDE dendro; */ /* (iii) %dendro; */ /* */ /* */ /* Parameter definitions */ /* */ /* Parameter Values Action */ /* format portrait vertical display */ /* landscape horizontal display */ /* xlabel 'text' label for x-axis (values axis) */ /* ylabel 'text' label for y-axis (distance axis) */ /* labels down plots x-axis labels vertically */ /* across plots x-axis labels horizontally */ /* */ /* color any determines colour of lines */ /* */ /* axis any determines colour of axes */ /* */ /* back any determines colour of background */ /* */ /* text any determines colour of text */ /* */ /* Take a copy of the tree data set. This allows macro */ /* to be re-executed without re-running cluster step */ data tree2; set tree; run; data heights(keep=nc _height_) formatds(keep=fmtname start label); /* pick up the distances at which each fusion is formed */ length label start fmtname $ 8; retain n nc varclus 0 last 0; retain ymin 99999 ymax -99999 minid maxid; retain fmtname '$clnamft'; set tree2 end=eof; if _n_ = 1 then do; /* detect the procedure used to create the tree dataset */ if substr(_name_,1,5) = 'CLUS1' then do; varclus=1; call symput('proc','varclus'); end; else do; n=_ncl_; nc=input(substr(_parent_,3,4),4.); if _height_ = 0.0 then ymin=0.0; call symput('nc',left(put(nc,4.))); call symput('proc','cluster'); end; end; if varclus = 0 then do; if _ncl_ < n; nc=input(substr(_name_,3,4),4.); actualnc+1; output heights; end; else do; ch=substr(_name_,5,1); digit=indexc('0123456789',ch); if substr(_name_,1,4) = 'CLUS' and digit > 0 then do; nc=_ncl_; actualnc+1; _height_=_propor_; output heights; start=_name_; label=left(put(_ncl_,4.)); output formatds; end; else do; last=1; call symput('nc',left(put(nc,4.))); end; end; if last = 0 then do; if _height_ > ymax then do; ymax=_height_; maxid=_n_; end; if _height_ < ymin then do; ymin=_height_; minid=_n_; end; end; if eof = 1 or last = 1 then do; call symput('actualnc',left(put(actualnc,4.))); call symput('ymax',left(put(ymax,best12.))); call symput('ymin',left(put(ymin,best12.))); if varclus = 0 and maxid > minid then call symput('order','normal'); else call symput('order','reverse'); end; run; proc sort data=heights; by descending nc; run; /* create macro variable NP (no. of observations) */ %let np=%eval(&nc+1); /* adjust nc (if disjoint clusters exist */ %let diff=%eval(&nc-&actualnc); %if %eval(&diff) > 0 %then %let nc=%eval(&actualnc); %if &proc=varclus %then %do; proc format cntlin=formatds; run; proc sort data=tree2; by descending _parent_; run; %end; data clusters (keep=nc clno1 clno2) formatds (keep=fmtname start label); /* pick up the node numbers for each fusion */ length c1 c2 label $ 16 fmtname $ 8 clusid $ 4; length nc clno1 clno2 start 8; array cl(2) c1 c2; array clusno(2) clno1 clno2; retain ic 0 fmtname 'nodeft' pos 3 clusid 'CL '; /* reset cluster flags if tree created by VARCLUS */ if _n_=1 and "&proc"="varclus" then do; pos=5; clusid='CLUS'; end; /* process two records pertaining to fusion */ do i=1 to 2; set tree2; if _parent_ = '' then stop; /* store the cluster name - use left to */ /* eliminate spaces that exist in numeric */ /* IDs. */ cl(i)=left(_name_); /* pick up the number of the parent node */ %if "&proc"="cluster" %then %do; nc=input(substr(_parent_,3,4),4.); %end; %else %do; nc=input(left(put(_parent_,$clnamft.)),4.); %end; /* establish node number as numeric to facilitate */ /* implementation of the tree-traversal algorithm */ /* flag clusters by negating the node number */ /* build format for re-attachment of node IDs later */ ch=substr(cl(i),pos,1); digit=indexc('0123456789',ch); if substr(cl(i),1,pos-1) eq substr(clusid,1,pos-1) and digit > 0 then do; %if "&proc"="cluster" %then %do; clusno(i)=-input(substr(cl(i),3,4),4.); %end; %else %do; clusno(i)=-input(left(put(cl(i),$clnamft.)),4.); %end; end; else do; ic=ic+1; clusno(i)=ic; start=ic; label=cl(i); output formatds; end; end; output clusters; run; /* Use PROC FORMAT to create format variable nodeft */ /* from the control data set , formatds. */ proc format cntlin=formatds; run; %if &proc=varclus %then %do; proc sort data=clusters; by descending nc; run; %end; /* combine the distances with the node numbers */ data treedata(keep=nc _height_ clno1 clno2); merge clusters heights; by descending nc; /* adjust cluster numbers to range from 1 if */ /* disjoint clusters have been detected */ %if %eval(&diff)>0 %then %do; nc=nc-%eval(&diff); if clno1<0 then clno1=clno1+%eval(&diff); if clno2<0 then clno2=clno2+%eval(&diff); %end; run; proc sort; by nc; run; /* transpose the data to facilitate building of arrays */ proc transpose data=treedata out=treetran; run; /* build arrays used by tree traversal algorithm */ data arrays; array ncl(j) nc1-nc&nc; array c1(j) acl1-acl&nc ; array c2(j) bcl1-bcl&nc; array height(j) h1-h&nc; array ordr(j) ordr1-ordr&nc; array prev(j) prev1-prev&nc; array stat(j) stat1-stat&nc; array col(j) col1-col&nc; array super(k) ncl c1 c2 height; do k=1 to 4; set treetran; do j=1 to &nc; super=col; end; end; do j=1 to &nc; stat=0; ordr=0; prev=0; end; output; drop col1-col&nc j k _name_; stop; run; /* Tree traversal step */ /* The search begins at the top of the tree and progresses */ /* downwards until a terminal node is found. The tree is */ /* then traversed backwards to the parent node from which */ /* a search for a second terminal node is made. Once the */ /* second node has been found, the algorithm returns to the */ /* pair of nodes of which the parent node is one and */ /* attempts to fuse those. If fusion is not possible, the */ /* search continues as before in a downward direction. */ data ordernos(keep=acl1-acl&nc bcl1-bcl&nc nc1-nc&nc h1-h&nc ordr1-ordr&nc); array c(2,&nc) acl1-acl&nc bcl1-bcl&nc; array ncl(&nc) nc1-nc&nc; array height(&nc) h1-h&nc; array ordr(&nc) ordr1-ordr&nc; array prev(&nc) prev1-prev&nc; array stat(&nc) stat1-stat&nc; set arrays; fusions=0; i=1; j=1; prev(1)=0; %checkn do while (fusions<&nc); /* if disjoint clusters exist, find start of new sub-tree */ if j = 0 then do; do j=1 to &nc until(nodestat=0); nodestat=stat(j); end; prev(j)=0; i=1; end; select (nodestat); /* if its a fusion node with inferior */ /* terminal nodes as yet not reached */ when (0) do; %findnode i=2; %checkn if nodestat=1 then %fuse end; /* if its a terminal node */ when (1) do; i=2; /* find the status of its partner */ %checkn /* if its a terminal node, then fuse */ if nodestat=1 then %fuse else do; /* else, continue search until one found */ %findnode i=2; /* and then perform similar attempt at fusion */ /* Note that failure to fuse here leaves the */ /* starting position for a search set correctly */ /* for the return to the top of the DO WHILE loop */ %checkn if nodestat=1 then %fuse end; end; end; end; output; stop; run; /* Transpose the contents of the arrays to form */ /* a data set containing the original data on nodes */ /* together with the variable ord indicating the */ /* order in which the fusion is to be processed */ data ordtree(keep=nc clno1 clno2 ht ord); length h1-h&nc nc1-nc&nc ordr1-ordr&nc 8; array c1(&nc) acl1-acl&nc; array c2(&nc) bcl1-bcl&nc; array height(&nc) h1-h&nc; array ncl(&nc) nc1-nc&nc; array ordr(&nc) ordr1-ordr&nc; set ordernos; do j=1 to &nc; nc=ncl(j); clno1=c1(j); clno2=c2(j); ht=height(j); ord=ordr(j); output; end; stop; run; /* sort in ascending oder of ord */ proc sort; by ord; run; /* Generate the annotate data set to draw the */ /* dendrogram */ data dendro(keep=function x y xsys ysys color) formatds(keep=fmtname start label) formaty(keep=fmtname start ylabel rename=(ylabel=label)); length function color fmtname start $ 8 xsys ysys $ 1; length label ylabel $ 16; array clx(&nc) cx1-cx&nc; array cly(&nc) cy1-cy&nc; array cl(2) clno1 clno2; array height(2) ht1 ht2; array xcoord(2) x1 x2; retain cx1-cx&nc cy1-cy&nc ipt shiftl 0 ymax ymin inc; retain xsys ysys '2' color "&color"; if _n_ = 1 then do; /* determine a suitable increment for y axis */ ymax=input(symget('ymax'),best12.); ymin=input(symget('ymin'),best12.); r=ymax-ymin; p=floor(log10(r)); k=10**(p-1); inc=input(put(ceil(r/k)*0.1,rft.),2.)*k; call symput('inc',left(put(inc,best12.))); /* adjust limits if necessary and re-set macro variables */ if r-int(r/inc)*inc > 0 then do; ymin=floor(ymin/inc)*inc; call symput('ymin',left(put(ymin,best12.))); newr=ymax-ymin; if newr-int(newr/inc)*inc > 0 then do; ymax=ceil(ymax/inc)*inc; call symput('ymax',left(put(ymax,best12.))); end; end; /* if y values are inverted, generate format for y axis */ %if "&order"="reverse" %then %do; ylim=ymax-ymin; call symput('ymin',put(0,1.)); call symput('ymax',left(put(ylim,best12.))); fmtname='yft'; do y=ymax to ymin by -inc; start=left(put(ymax-y,best12.)); ylabel=left(put(y,best12.)); output formaty; end; %end; end; set ordtree; clusters=0; do i=1 to 2; /* if its a terminal node (but not a fusion point) */ /* assign it as the next point on the x-axis and */ /* use format variable nodeft., created earlier, */ /* to re-assign label to the assigned value of x. */ /* Then generate plot instructions. */ if cl(i) > 0 then do; ipt+1; start=left(put(ipt,5.)); label=put(cl(i),nodeft.); fmtname='labelft'; output formatds; x=ipt; %if "&order"="normal" %then %do; ystart=ymin; %end; %else %do; ystart=ymax; %end; height(i)=ht; %genlines end; /* if its a fusion point, pick up its co-ordinates */ /* from the arrays clx and cly and generate plot */ /* instructions. */ else do; clusters=clusters+1; clpt=-cl(i); x=clx(clpt); ystart=cly(clpt); height(i)=ystart; %genlines end; end; /* after processing this pair, store co-ordinates */ /* of the new fusion node. If two nodes fuse at */ /* the same height and one is a cluster, apply a */ /* correction factor to centre verticals. */ /* (applies to density linkage) */ if clusters=1 and ht1=ht2 then shiftl=shiftl+0.25; else shiftl=0; clx(nc)=(x1+x2)/2-shiftl; cly(nc)=ht; run; /* interchange x and y if landscape format is required */ %if %upcase(&format)=LANDSCAPE %then %do; data dendro(rename=(x=y y=x)); set dendro; run; %end; /* Use PROC FORMAT to create format variable labelft */ /* from the control data set , formatds. */ proc format cntlin=formatds; run; %if "&order"="reverse" %then %do; proc format cntlin=formaty; run; %end; goptions cback=&back ctext=&text vpos=60 hpos=120; /* plot dendrogram using Annotate data set */ proc gplot data=dendro; axis1 order=1 to &np by 1 minor=none %if %upcase(&labels)=DOWN & %upcase(&format) NE LANDSCAPE %then %do; value=(angle=-90 rotate=90 height=1 font=swiss) %end; %else %do; value=(height=1 font=swiss) %end; %if &xlabel= %then %do; label=none; %end; %else %do; %if %upcase(&format) ne LANDSCAPE %then %do; label=(height=2 font=swiss "&xlabel"); %end; %else %do; label=(angle=90 height=2 font=swiss "&xlabel"); %end; %end; axis2 order = &ymin to &ymax by &inc minor = none value = (height=1 font=swiss) %if &ylabel= %then %do; %if %upcase(&format) ne LANDSCAPE %then %do; label=(angle=90 height=2 font=swiss "Distance"); %end; %else %do; label=(height=2 font=swiss "Distance"); %end; %end; %else %do; %if %upcase(&format) ne LANDSCAPE %then label=(angle=90 height=2 font=swiss "&ylabel"); %else label=(height=2 font=swiss "&ylabel"); %end; %if %upcase(&format) ne LANDSCAPE %then %do; plot y*x/haxis=axis1 vaxis=axis2 %end; %else %do; plot x*y/haxis=axis2 vaxis=axis1 %end; caxis=&axis annotate=dendro; symbol1 v=none; /* suppresses plot symbols */ footnote1 angle=-90 ' '; %if %upcase(&format) ne LANDSCAPE %then %do; format x labelft.; %end; %else %do; format y labelft.; %end; %if "&order"="reverse" %then %do; %if %upcase(&format) ne LANDSCAPE %then %do; format y yft.; %end; %else %do; format x yft.; %end; %end; run; %mend dendro; options source2;