goptions reset=all;
options linesize=137;
*********************************************************************;
* Histoplt.sas *;
* Function: to produce a histogram with a normal parametric *;
* distribution curve with an optional additional *;
* user specified normal curve *;
* Support: SAS Institute Tech Support staff *;
* Enhancements: *;
*********************************************************************;
* Uncomment the line below for debugging *;
* *;
* options symbolgen macrogen mprint; *;
* *;
*********************************************************************;
*********************************************************************;
* First set the range of data for the distribution. *;
* If you set the low AND high ranges to 0, then the program will *;
* get maximum and minimum values from the data and use these for *;
* the data range. *;
* *;
* Other parameters are also set from this section *;
*********************************************************************;
data setup;
%let lowrange=0; /* lowest bar category and highest bar */
%let hirange= 0; /* if both values are set to 0 then we will */
/* retrieve the low and high range from the */
/* data itself. The ranges allow you to */
/* eliminate outlying data or explore part */
/* of the data easily */
%let barnum=5; /* number of bars in the distribution */
%let ptnum=100; /* number of points drawn on the curve */
%let barcolor='red'; /* color of bars */
%let barpat=solid; /* pattern for the bars */
%let outline='blue'; /* color of the bar outline */
%let lineclr1 =blue; /* color of the curve line */
%let lineclr2 =green; /* color of user supplied distribution line */
%let linesty1=1; /* linestyle of the SAS-generated
distribution line */
%let linesty2=2; /* linestyle of user supplied line */
%let curvewid=3; /* thickness of the line drawn on the curve */
%let mean2=30; /* mean of the curve you select vs. the
generated mean from the data */
%let std2=10; /* std deviation on the curve you select vs.
the generated value from the data */
%let drawline=join; /* draw the computed normal curve
'none' turns off the line */
%let drawlin2=join; /* draw the user specified line--'none'
turns off the line */
%let legend=frame; /* draws the specified legend frame */
/* set this to blank if drawline and */
/* drawlin2 are set to none */
%let tailparm=3; /* number of std deviations on the curve, */
/* The higher the value, the more tail on
the curve */
%let title1= Histogram with Normal Density Curve; /* title text */
%let tclr= magenta; /* color of the title */
%let theight =2; /* height of the title */
%let tfont=swiss; /* font for the title */
%let label1=Est. density; /* label for computed normal curve */
%let label2=Spec. density; /* label for user specified normal curve
off if drawline is 'none' */
%let fnote1a=Est. Mean = ; /* label for estimated mean footnote */
%let fnote1b=Est. Stddev = ; /* label for estimated std footnote */
%let fnote2a=Spec. Mean = ; /* label for user specified mean
turned off if drawline is none */
%let fnote2b=Spec. Stddev = ; /* label for the user specified deviation
turned off if drawline is none */
run;
*********************************************************************;
* Read in the data. *;
*********************************************************************;
data temp1;
input xvar @@;
cards;
5 10 10 15 15 15 15 15 15 20 20 20 20 20 20 20 25
25 25 25 25 25 25 30 30 30 30 30 35 35 35 40 40 40
;
proc sort; by xvar;
****************************************************************;
* Now run PROC MEANS to create data that will be used to *;
* generate the normal density curve in the next step. *;
****************************************************************;
proc means data=temp1 n var mean min max std sum;
var xvar;
output out=newmean var=var1 mean=mean1 std=std1 min=min1 max=max1 n=n1;
run;
*************************************************************************;
* Store parameter estimates calculated by PROC MEANS as macro variables *;
*************************************************************************;
data newmean; set newmean;
call symput('topn',n1);
call symput('compmean',mean1);
call symput('compstd',std1);
call symput('compmin',min1);
call symput('compmax',max1);
proc print data=newmean; run;
********************************************************************;
* Generate a normal distribution curve from the data generated *;
* from the proc means output *;
********************************************************************;
data curve1(keep=x fx min1 max1); set newmean;
mu= mean1; /* variance and mean */
first=mu - &tailparm * std1;
last=mu + &tailparm * std1;
byval=(last-first)/&ptnum;
pi=arcos(-1);
denom=1/sqrt(2*pi*var1);
do x=first to last by byval;
fx=denom*exp(-(x-mu)**2/(2* var1));
output;
end;
run;
********************************************************************;
* Generate a normal distribution curve using the *;
* user supplied mean and standard deviation *;
********************************************************************;
%macro curveit(curve, mu, tailparm, std, ptnum2);
data &curve;
first=&mu - &tailparm * &std;
var1=&std ** 2;
last=&mu + &tailparm * &std;
byval=(last-first)/&ptnum2;
pi=arcos(-1);
denom=1/sqrt(2*pi*var1);
do x2=first to last by byval;
fx2=denom*exp(-(x2-&mu)**2/(2* var1));
output;
end;
run;
%mend curveit;
%curveit(curve2, &mean2,&tailparm,&std2, &ptnum);
*********************************************************;
* Combine the two curve values into one data *;
* set that we can use in the GPLOT step *;
*********************************************************;
***********************************************************************;
* If the DRAWLINE parameter is set to none, then we want to eliminate *;
* the second curve. To do this we set values of the second curve *;
* equal to the values of the first curve *;
***********************************************************************;
data newcurve; merge curve1 (keep=x fx) curve2 (keep = x2 fx2);
if "&drawline"="none" then do;
call symput('mean','');
call symput('std','');
call symput('label1','');
call symput('fnote1a','');
call symput('fnote1b','');
end;
if "&drawlin2"="none" then do;
call symput('mean2','');
call symput('std2','');
call symput('label2','');
call symput('fnote2a','');
call symput('fnote2b','');
fx2=fx;
x2=x;
end;
run;
***********************************************************************;
* This DATA step sorts the data into groups and sets the bounds *;
* for the midpoints of the vertical bars. If the LOWRANGE and *;
* HIGHRANGE parameters are set to 0, we use the maximum *;
* and minimum values to set the range for the vertical bars. *;
* There are 3 DATA steps here because of the macro variables involved.*;
***********************************************************************;
data temp2; set temp1;
if ((&lowrange=0) and (&hirange=0)) then do;
call symput('lowrange',"&compmin");
call symput('hirange',"&compmax");
end;
run;
data temp2; set temp2;
groupint=(&hirange-&lowrange)/(&barnum-1);
call symput('groupint',groupint);
run;
data temp2; set temp2;
do i = &lowrange to &hirange by &groupint;
subtract=&groupint/2;
lowbound =i-subtract;
hibound=i+ subtract;
if xvar>=lowbound and xvar <=hibound then group=i;
end;
output;
run;
data temp2; set temp2;
subtract=&groupint/2;
lowbound =group-subtract;
hibound=group+ subtract;
run;
***************************************************************;
* Calculate the frequency for each group and *;
* get the density value for the bar and the frequency value. *;
***************************************************************;
proc freq data=temp2;
tables group/out=newgrp1 ;
run;
data newgrp1; set newgrp1;
density= ((count/ &topn) / &groupint);
******************************************************************;
* Because some of the midpoints will have no data, we must *;
* create bars with zero observations in them to merge with *;
* the original dataset so that all group bars are present. *;
******************************************************************;
data grp0;
do i = &lowrange to &hirange by &groupint;
group=i;
density=0;
count=0;
percent=0;
output;
end;
run;
data newgrp2; merge grp0 newgrp1;
by group;
run;
*********************************************************;
* Create the boundaries for the midpoints for *;
* each bar in this step. This controls the number *;
* of observations that will be represented by each bar *;
*********************************************************;
data newgrp3; set newgrp2; by group;
subtract=&groupint/2;
lowbound =i-subtract;
hibound=i+ subtract;
run;
***********************************************************************;
* The dataset is now converted into ANNOTATE observations which can *;
* be used to draw the bars *;
***********************************************************************;
data anno3; set newgrp3;
length function color style $ 8;
xsys='2'; ysys='2';
function='move'; x=lowbound; y=0; output;
function='poly'; x=lowbound; y=0; color=&barcolor; style='solid';
output;
function='polycont'; x=lowbound; y=density; color=&barcolor;
style='solid'; output;
function='polycont'; x=hibound; y=density; color=&barcolor;
style='solid'; output;
function='polycont'; x=hibound; y=0; color=&barcolor; output;
style='solid'; output;
function='move'; x=lowbound; y=0; output;
function='draw'; x=lowbound; y=density; color=&outline; output;
function='draw'; x=hibound; y=density; color=&outline; output;
function='draw'; x=hibound; y=0; color=&outline; output;
function='draw'; x=lowbound; y=0; color=&outline; output;
run;
***************************************************************;
* We then figure out which is taller--the maximum curve value *;
* or the maximum bar value. This is used to determine the *;
* vertical axis range for the plot. *;
***************************************************************;
proc means data=newgrp2 max;
var density;
output out=maxgrp max=maxgrp1;
run;
proc means data=curve1 max;
var fx;
output out=maxcurve max=maxcrv1;
run;
proc means data=curve2 max;
var fx2;
output out=maxcurv2 max=maxcrv2;
run;
*************************************************************;
* This DATA step determines the maximum value (either *;
* one of the curve values or the bar density), *;
* and then adds .005 and rounds it to the nearest *;
* hundredth so that the axis ordering can be in equal *;
* values. It does not affect the charting data. *;
*************************************************************;
data maxaxis; merge maxgrp maxcurve maxcurv2;
if (maxgrp1 >= maxcrv1) then maxaxis=maxgrp1;
else maxaxis=maxcrv1;
if (maxaxis >= maxcrv2) then maxaxis=maxaxis;
else maxaxis=maxcrv2;
maxaxisr=maxaxis + .005;
maxaxisr=round(maxaxisr, .01);
call symput('maxaxisr',maxaxisr);
run;
proc print data=maxaxis;
title 'maximum values for group and curve values';
run;
******************************************************************;
* We then plot the generated curve data and use the ANNOTATE *;
* data set to draw the bars. *;
******************************************************************;
proc gplot data=newcurve anno=anno3;
Title h=&theight f=&tfont c=&tclr "&title1" ;
axis1 order=(0 to &maxaxisr by .01) offset=(0);
legend1 position=(top right inside) mode=protect across=1
value=("&label1" "&label2") label=none &legend;
plot fx * x fx2 * x2/overlay vaxis=axis1 legend=legend1;
symbol1 c=&lineclr1 l=&linesty1 i=&drawline v=none w=&curvewid;
symbol2 c=&lineclr2 l=&linesty2 i=&drawlin2 v=none w=&curvewid;
footnote1 c=black f=swissl
"&fnote1a &compmean &fnote1b &compstd";
footnote2 c=black f=swissl
"&fnote2a &mean2 &fnote2b &std2";
run;
quit;