goptions reset=all; options linesize=137; *********************************************************************; * Histoplt.sas *; * Function: to produce a histogram with a normal parametric *; * distribution curve with an optional additional *; * user specified normal curve *; * Support: SAS Institute Tech Support staff *; * Enhancements: *; *********************************************************************; * Uncomment the line below for debugging *; * *; * options symbolgen macrogen mprint; *; * *; *********************************************************************; *********************************************************************; * First set the range of data for the distribution. *; * If you set the low AND high ranges to 0, then the program will *; * get maximum and minimum values from the data and use these for *; * the data range. *; * *; * Other parameters are also set from this section *; *********************************************************************; data setup; %let lowrange=0; /* lowest bar category and highest bar */ %let hirange= 0; /* if both values are set to 0 then we will */ /* retrieve the low and high range from the */ /* data itself. The ranges allow you to */ /* eliminate outlying data or explore part */ /* of the data easily */ %let barnum=5; /* number of bars in the distribution */ %let ptnum=100; /* number of points drawn on the curve */ %let barcolor='red'; /* color of bars */ %let barpat=solid; /* pattern for the bars */ %let outline='blue'; /* color of the bar outline */ %let lineclr1 =blue; /* color of the curve line */ %let lineclr2 =green; /* color of user supplied distribution line */ %let linesty1=1; /* linestyle of the SAS-generated distribution line */ %let linesty2=2; /* linestyle of user supplied line */ %let curvewid=3; /* thickness of the line drawn on the curve */ %let mean2=30; /* mean of the curve you select vs. the generated mean from the data */ %let std2=10; /* std deviation on the curve you select vs. the generated value from the data */ %let drawline=join; /* draw the computed normal curve 'none' turns off the line */ %let drawlin2=join; /* draw the user specified line--'none' turns off the line */ %let legend=frame; /* draws the specified legend frame */ /* set this to blank if drawline and */ /* drawlin2 are set to none */ %let tailparm=3; /* number of std deviations on the curve, */ /* The higher the value, the more tail on the curve */ %let title1= Histogram with Normal Density Curve; /* title text */ %let tclr= magenta; /* color of the title */ %let theight =2; /* height of the title */ %let tfont=swiss; /* font for the title */ %let label1=Est. density; /* label for computed normal curve */ %let label2=Spec. density; /* label for user specified normal curve off if drawline is 'none' */ %let fnote1a=Est. Mean = ; /* label for estimated mean footnote */ %let fnote1b=Est. Stddev = ; /* label for estimated std footnote */ %let fnote2a=Spec. Mean = ; /* label for user specified mean turned off if drawline is none */ %let fnote2b=Spec. Stddev = ; /* label for the user specified deviation turned off if drawline is none */ run; *********************************************************************; * Read in the data. *; *********************************************************************; data temp1; input xvar @@; cards; 5 10 10 15 15 15 15 15 15 20 20 20 20 20 20 20 25 25 25 25 25 25 25 30 30 30 30 30 35 35 35 40 40 40 ; proc sort; by xvar; ****************************************************************; * Now run PROC MEANS to create data that will be used to *; * generate the normal density curve in the next step. *; ****************************************************************; proc means data=temp1 n var mean min max std sum; var xvar; output out=newmean var=var1 mean=mean1 std=std1 min=min1 max=max1 n=n1; run; *************************************************************************; * Store parameter estimates calculated by PROC MEANS as macro variables *; *************************************************************************; data newmean; set newmean; call symput('topn',n1); call symput('compmean',mean1); call symput('compstd',std1); call symput('compmin',min1); call symput('compmax',max1); proc print data=newmean; run; ********************************************************************; * Generate a normal distribution curve from the data generated *; * from the proc means output *; ********************************************************************; data curve1(keep=x fx min1 max1); set newmean; mu= mean1; /* variance and mean */ first=mu - &tailparm * std1; last=mu + &tailparm * std1; byval=(last-first)/&ptnum; pi=arcos(-1); denom=1/sqrt(2*pi*var1); do x=first to last by byval; fx=denom*exp(-(x-mu)**2/(2* var1)); output; end; run; ********************************************************************; * Generate a normal distribution curve using the *; * user supplied mean and standard deviation *; ********************************************************************; %macro curveit(curve, mu, tailparm, std, ptnum2); data &curve; first=&mu - &tailparm * &std; var1=&std ** 2; last=&mu + &tailparm * &std; byval=(last-first)/&ptnum2; pi=arcos(-1); denom=1/sqrt(2*pi*var1); do x2=first to last by byval; fx2=denom*exp(-(x2-&mu)**2/(2* var1)); output; end; run; %mend curveit; %curveit(curve2, &mean2,&tailparm,&std2, &ptnum); *********************************************************; * Combine the two curve values into one data *; * set that we can use in the GPLOT step *; *********************************************************; ***********************************************************************; * If the DRAWLINE parameter is set to none, then we want to eliminate *; * the second curve. To do this we set values of the second curve *; * equal to the values of the first curve *; ***********************************************************************; data newcurve; merge curve1 (keep=x fx) curve2 (keep = x2 fx2); if "&drawline"="none" then do; call symput('mean',''); call symput('std',''); call symput('label1',''); call symput('fnote1a',''); call symput('fnote1b',''); end; if "&drawlin2"="none" then do; call symput('mean2',''); call symput('std2',''); call symput('label2',''); call symput('fnote2a',''); call symput('fnote2b',''); fx2=fx; x2=x; end; run; ***********************************************************************; * This DATA step sorts the data into groups and sets the bounds *; * for the midpoints of the vertical bars. If the LOWRANGE and *; * HIGHRANGE parameters are set to 0, we use the maximum *; * and minimum values to set the range for the vertical bars. *; * There are 3 DATA steps here because of the macro variables involved.*; ***********************************************************************; data temp2; set temp1; if ((&lowrange=0) and (&hirange=0)) then do; call symput('lowrange',"&compmin"); call symput('hirange',"&compmax"); end; run; data temp2; set temp2; groupint=(&hirange-&lowrange)/(&barnum-1); call symput('groupint',groupint); run; data temp2; set temp2; do i = &lowrange to &hirange by &groupint; subtract=&groupint/2; lowbound =i-subtract; hibound=i+ subtract; if xvar>=lowbound and xvar <=hibound then group=i; end; output; run; data temp2; set temp2; subtract=&groupint/2; lowbound =group-subtract; hibound=group+ subtract; run; ***************************************************************; * Calculate the frequency for each group and *; * get the density value for the bar and the frequency value. *; ***************************************************************; proc freq data=temp2; tables group/out=newgrp1 ; run; data newgrp1; set newgrp1; density= ((count/ &topn) / &groupint); ******************************************************************; * Because some of the midpoints will have no data, we must *; * create bars with zero observations in them to merge with *; * the original dataset so that all group bars are present. *; ******************************************************************; data grp0; do i = &lowrange to &hirange by &groupint; group=i; density=0; count=0; percent=0; output; end; run; data newgrp2; merge grp0 newgrp1; by group; run; *********************************************************; * Create the boundaries for the midpoints for *; * each bar in this step. This controls the number *; * of observations that will be represented by each bar *; *********************************************************; data newgrp3; set newgrp2; by group; subtract=&groupint/2; lowbound =i-subtract; hibound=i+ subtract; run; ***********************************************************************; * The dataset is now converted into ANNOTATE observations which can *; * be used to draw the bars *; ***********************************************************************; data anno3; set newgrp3; length function color style $ 8; xsys='2'; ysys='2'; function='move'; x=lowbound; y=0; output; function='poly'; x=lowbound; y=0; color=&barcolor; style='solid'; output; function='polycont'; x=lowbound; y=density; color=&barcolor; style='solid'; output; function='polycont'; x=hibound; y=density; color=&barcolor; style='solid'; output; function='polycont'; x=hibound; y=0; color=&barcolor; output; style='solid'; output; function='move'; x=lowbound; y=0; output; function='draw'; x=lowbound; y=density; color=&outline; output; function='draw'; x=hibound; y=density; color=&outline; output; function='draw'; x=hibound; y=0; color=&outline; output; function='draw'; x=lowbound; y=0; color=&outline; output; run; ***************************************************************; * We then figure out which is taller--the maximum curve value *; * or the maximum bar value. This is used to determine the *; * vertical axis range for the plot. *; ***************************************************************; proc means data=newgrp2 max; var density; output out=maxgrp max=maxgrp1; run; proc means data=curve1 max; var fx; output out=maxcurve max=maxcrv1; run; proc means data=curve2 max; var fx2; output out=maxcurv2 max=maxcrv2; run; *************************************************************; * This DATA step determines the maximum value (either *; * one of the curve values or the bar density), *; * and then adds .005 and rounds it to the nearest *; * hundredth so that the axis ordering can be in equal *; * values. It does not affect the charting data. *; *************************************************************; data maxaxis; merge maxgrp maxcurve maxcurv2; if (maxgrp1 >= maxcrv1) then maxaxis=maxgrp1; else maxaxis=maxcrv1; if (maxaxis >= maxcrv2) then maxaxis=maxaxis; else maxaxis=maxcrv2; maxaxisr=maxaxis + .005; maxaxisr=round(maxaxisr, .01); call symput('maxaxisr',maxaxisr); run; proc print data=maxaxis; title 'maximum values for group and curve values'; run; ******************************************************************; * We then plot the generated curve data and use the ANNOTATE *; * data set to draw the bars. *; ******************************************************************; proc gplot data=newcurve anno=anno3; Title h=&theight f=&tfont c=&tclr "&title1" ; axis1 order=(0 to &maxaxisr by .01) offset=(0); legend1 position=(top right inside) mode=protect across=1 value=("&label1" "&label2") label=none &legend; plot fx * x fx2 * x2/overlay vaxis=axis1 legend=legend1; symbol1 c=&lineclr1 l=&linesty1 i=&drawline v=none w=&curvewid; symbol2 c=&lineclr2 l=&linesty2 i=&drawlin2 v=none w=&curvewid; footnote1 c=black f=swissl "&fnote1a &compmean &fnote1b &compstd"; footnote2 c=black f=swissl "&fnote2a &mean2 &fnote2b &std2"; run; quit;