/**************** * File Name: sirnorm.sas * Programmers: C. Hendricks Brown & Xiange Ling, University of South Florida * Last Modified: 06/10/96 * copyright, all rights reserved, 1996 * * The program is designed to do multiple imputation to fill the missing values * by using the sampling importance resampling (SIR) algorithm, an inefficient * but nevertheless accurate means of handling data which are approximately * normally distributed and "missing at random." * * While efforts have been made to make this program run correctly for * general problems, the authors cannot guarantee correct answers will * be obtained in every problem. If you experience problems with this program, * please send a comment to the first author at chb@yates.coph.usf.edu. * Also, please be aware that newer versions of this program may be made * available at the same site you received this; old versions will not be * maintained. * * To run this program, you must have SAS' BASE, MACRO, and IML modules. * While this program is not guaranteed to run in future SAS releases, * it is likely to run in versions 6.10 and higher. * This program has been tested rigorously on a version of SAS running * under UNIX. It has been tested less on PC-SAS running under the different * MS-Windows operating systems/graphical interfaces. If you have problems * running this program under PC-SAS, check for memory management problems. * * Usage: %sirnorm(InputDataName, OutputDataName, M, K) * This program requires SAS IML, MACRO modules besides the BASE. * (%sirnorm also needs the SAS macro %sirnorm1 which is included * in this source file. Do not use %sirnorm1; it is called automatically * by %sirnorm). * * Input: InputDataName: * The name of the dataset which has the missing values. * (remember to obey SAS' limit of 8 characters for a name;) * The input data MUST BE in SAS dataset format and MUST CONTAIN ONLY * the variables for imputation. * The SAS dataset can either be in permanent or temporary form. * * If the input data are in a temporary SAS dataset, you can just pass your dataset to %sirnorm. * For example: * * %sirnorm(inputDataSetName, OutData, M, K) * * If the input data are in a permanent SAS dataset, you need to put a "libname" statement before * invoking the macro %sirnorm. * For example: * * SAS under the UNIX system * libname inPath "/export/home/yates/ling"; * | * ------the path in quotation is where your dataset is. * %sirnorm(inPath.inputDatasetName, OutData, M, K); * * * SAS under DOS/WINDOWS environment * libname inPath "C:\sirnorm"; * | | * | -----the path in quotation is where the inputdataset is. * | * ----------the the hard drive letter. * %sirnorm(inPath.inputDatasetName, OutData, M, K); * * * OutputDataName: * The output data will have M imputations of your input data with all missing * being replaced by the imputed values. The first full dataset contining imputed * values which fill in the missing values are followed immediately by rows corresponding * to the second full dataset containing different imputed values and so on. Thus the * output data contains M times as many rows as does the input dataset. * There will also be two new variables in the resulting dataset: * _imptn_ indicating the imputation to which the record belongs, and * _obsn_ indicating the original order of the record. * * As with InputDataName, you may also have a temporary or permenant output dataset. * * If you want it be a permenant form, you have to * put a "libname" statement before invoking %sirnorm, specifying where you want to * put the dataset, given that the path has not been specified in a 'libname' statement * before. * Following the example above, we will have: * * SAS under UNIX: * libname inPath "/export/home/yates/ling"; * | * ------the path in quotations is where your dataset is. * libname outPath "/export/Jhu/missing/ling"; * | * ------the path in quotations is where dataset will be. * * %sirnorm(inPath.inputDatasetName, outPath.outputData, M, K); * * SAS under DOS/WINDOWS: * libname inPath "C:\sirnorm"; * | * -----the path in quotations is where the input dataset is. * * * libname outPath "C:\jhu"; * | * -----the path in quotations is where the output dataset will be. * * * %sirnorm(inPath.inputDatasetName, outPath.OutData, M, K); * * * * M: * Number of multiple imputations for the resulting dataset * (often 3 to 5 is sufficient) * K: * Number of initial samplings used for SIR algorithm * (ordinarily 20 times M) * * * Output: * A SAS dataset called OutData of size (M * N) * (p + 2) * containing the original variables and two new variables * called _imptn_, the imputation number (1 through m) * and _obsn_no, the original observation number. * The first N rows correspond to the filled-in * matrix for imputation number 1, the next N rows for * imputation number 2, and so on. * * Comments: You may need to modify the above rules for choosing * k. Larger values of k take more time to run but increase * the accuracy of the random draws. Consider increasing k * at least to the suggested value if you receive a WARNING * message. * * Calculations: This program draws (samples) M * K values from a distribution * which approximates the exact posterior distribution of the * mean and variance-covariance parameters given the incomplete data. * The resampling is done by drawing K of these values from a weighted * distribution where the weights are determined by relative posterior * densities of the exact density and the approximate one used. * For a technical reference, see Rubin, DB (1987), Journal of the American * Statistical Association, vol. 82, 543-546 . */ %macro sirnorm1 (dataset,valsdata,M,K); proc transpose data = &dataset out = varnames; data varnames; set varnames; keep _name_; data __mns__ __cov__; set &valsdata; if _type_ = 'MEAN' or _type_ = 'mean' then output __mns__; if _type_ = 'COV' or _type_ = 'cov' then output __cov__; proc iml; /*options nocenter; */ use varnames; read all var {_name_}; _name_ = _name_`; use &dataset; read all into x; /*reset nocenter;*/ nvars = ncol(x); nobs = nrow(x); p = x; do j=1 to nvars; do i=1 to nobs; if p[i,j]=. then p[i,j]=0; else p[i,j]=1; end; end; q = J(nobs,nvars,1) - p; nbr_obsd = p[,+]; someobsd = loc(nbr_obsd); use __mns__; read all into muhat; use __cov__; read all into sigmahat; evals = eigval(sigmahat); if min(evals) <= 0 then do; /* fix whenever S is non positive definite */ oldmax = max(evals); evecs = eigvec(sigmahat); evals = evals - min(evals) + max(evals) / 10; evals = evals / evals[1,1] * oldmax; sigmahat = evecs * diag(evals) * evecs`; evals = eigval(sigmahat); end; mtimesk = &m * &k; logden_1=J(1,mtimesk,0); effect_n = round(nbr_obsd[+,+] / nvars); /* calculate an effective * sample size */ if effect_n > nvars then effect_n=round(effect_n*0.75); nu = effect_n - 1; if (nu < nvars + 1) then nu = nvars + 10; /* make sure we generate pos def rndm matrix */ totaln = 0; do i=1 to nobs; if ( 0 < nbr_obsd[i,1]) then do; /* some obsns missing */ totaln = totaln + 1; end; end; filldata = J( nobs * &m,nvars,0); imputatn = J( nobs * &m,1,0); obsn_no = J( nobs * &m,1,0); /* Generate random Wishart(nu, inv(nu * sigmahat)) */ chol = half( inv(nu * sigmahat)); allV = J( nvars * mtimesk , nvars, 0); allM = J(mtimesk ,nvars,0); det_sig=det(sigmahat); do sample = 1 to mtimesk; T = normal(J(nvars,nvars,0)); do i = 1 to nvars; thisal = (nu - i + 1) / 2; T[i,i] = sqrt(2 * rangam(0,thisal)); do ll = i+1 to nvars; T[i,ll] = 0; end; end; r=det(t * t`); /* print r; */ W = chol` * T * T` * chol; V = inv(W); /* V has inverse Wishart distribution */ dev_v=det(V); chol2 = half(V); M = muhat + normal(J(1,nvars,0)) * chol2` / sqrt( effect_n); allV[1 + (sample - 1) * nvars : sample * nvars,] = V; allM[sample,] = M; /* Calculating Log Densities of the two posteriors */ logden = 0; lgg=0; lgf=0; do i = 1 to nobs; if ( 0 < nbr_obsd[i,1]) then do; /* some obsns missing */ pi = loc(p[i,]); /* which variables are observed */ thisV = V[pi,pi]; det_thV=det(thisV); thisM = M[1,pi]; logden = logden - 0.5 * log(det(thisV)) + 0.5 * log(det(V)) * effect_n/totaln - 0.5 * (x[i,pi] - thisM) * inv(thisV) * (x[i,pi] - thisM)`; lgf=lgf - .5 * log(det(thisV)) - 0.5 * (x[i,pi] - thisM) * inv(thisV) * (x[i,pi] - thisM)`; end; end; lgg= - 0.5 * effect_n * log(det(V)) - nu/2 * trace( inv(V) * sigmahat) - 1/2 * (M - muhat) * inv(V) * (M - muhat)` ; u1=- 0.5 * effect_n * log(det(V)); u2=- nu/2 * trace( inv(V) * sigmahat); u3=-1/2 * (M - muhat) * inv(V) * (M - muhat)` ; gg_gf_di= lgf - lgg; logden = logden + nu/2 * trace( inv(V) * sigmahat) + 1/2 * (M - muhat) * inv(V) * (M - muhat)` ; logden_1[1,sample]=logden; end; /* end sample */ /*to calculate the mean and logdensity and generate a new matrix which is the difference of original value and mean across all sample */ Max_val=max(logden_1); indicat=J(1, mtimesk,0); max_mat=J(1,mtimesk, Max_val-10); indicat=(logden_1>=max_mat); m_logden=J(1,mtimesk,sum(logden_1#indicat)/sum(indicat)); dummy=sum(indicat); /* number of potential candidates to draw from out of k * m */ lowerlm = &m * 10 ; new_var = ceil(lowerlm / dummy * &k); reset log; if dummy < lowerlm then do; print "WARNING: The sampling number K you chose is likely to be too small, we suggest increasing it to at least: " new_var; end; /* if dummy < lowerlm then do; %put "WARNING: The sampling number K you chose is likely to be too small, please check the output file to find the suggested K"; end;*/ logden_2=(logden_1 - m_logden)#indicat; logden_3=(logden_1 - m_logden); weight=exp(logden_2)#indicat; weight=weight/weight[+,+]; range_w=J(1,mtimesk, 0); range_w[1,1]=weight[1,1]; do sample=2 to mtimesk; range_w[1,sample]=range_w[1,sample-1] + weight[1,sample]; end; /* Now choose one of the samples for each imputation */ do l = 1 to &m; rndm=uniform(0); choice=0; sss=1; flag=0; do while (flag=0); if rndm<=range_w[1,sss] then do; choice=sss; flag=1; end; else do; sss=sss+1; end; end; mu_star = allM[choice,]; start = (choice - 1) * nvars + 1; ending = choice * nvars; V_star = allV[start:ending,]; /* Fill in data given the resampled mean and variance */ do i = 1 to nobs; nvsobsd = p[i,+]; if ( 0 < nvsobsd ) then do; /* some data observed */ if ( nvsobsd < nvars) then do; /* incomplete data */ pi = loc(p[i,]); qi = loc(q[i,]); swept = sweep(V_star,pi); cholA = half(swept[qi,qi]); /* residual variances */ size = ncol(qi); rndm_dat = mu_star[1,qi] + normal(J(1,size,0)) * cholA; indxa = i + (l-1) * nobs; filldata[i + (l - 1) * nobs, pi] = x[i,pi]; filldata[i + (l - 1) * nobs, qi] = rndm_dat + (x[i,pi] - mu_star[1,pi]) * swept[pi,qi]; end; else if ( nvsobsd = nvars) then do; indxb = i + (l-1) * nobs; filldata[i + (l - 1) * nobs , ] = x[i,]; /* all obsd */ /* print indxb; */ end; end; else do; /* all data are missing */ indxc = i + (l-1) * nobs; cholA = half(V_star); filldata[i + (l - 1) * nobs, ] = mu_star[1,] + normal(J(1,nvars,0)) * cholA ; end; imputatn[ i + (l - 1) * nobs, 1] = l; obsn_no[i + (l -1) * nobs, 1] = i; end; end; /* end over m? */ rslt = filldata || imputatn || obsn_no; _allnme_ = _name_ || '_imptn_' ||'_obsn_no'; create _SNORM_ from rslt [colname = _allnme_]; append from rslt; /* proc print data = _SNORM_; run; */ %mend sirnorm1; /*****************************************************************/ %macro sirnorm(ImpData, OutData, M, K); %let ix=0; proc means var noprint data=&ImpData; output out=vart(drop=_type_ _freq_); /* take the variance for each var and output to tmp.var */ run; proc transpose data=vart out=vart; run; data _null_; set &ImpData end=last; if last then call symput('nnn', _n_); run; data vart(keep=_name_ std mmean); set vart; std=col5; mmean=col4; run; /* keep only the std of each variable and name */ proc sort data=vart out=vart; by _name_; run; proc transpose data=&ImpData out=outtr1; run; proc sort data=outtr1 out=outtr1; by _name_; run; data ok; merge vart outtr1; by _name_; run; data ok1(drop=std mmean) ok2(drop=std mmean); set ok; if std > 0 then output ok1; if std = 0 then do; %do kj=1 %to &nnn; col&kj=mmean; %end; output ok2; end; run; /* output variables with nonzero variance to one file * and those with 0 variance to another, as indicated in * following code, the first dataset (file ) is ended with * i,ike ot1114di, which means its original file is out1114d, * and it will be used for imputation. The second one ended * with a 'z', indicates that it contains variables with 0 * variance from dataset out1114d */ data _null_; set ok1 end=last; if last then do; if _n_=1 then call symput('bbb', 1); else call symput('bbb',0); end; run; %let ix=&bbb; /* above code is to create a index to * indicate if dataset ot####di has 0 variable, * if yes, then the index ix####=1, otherwise * ix####=0 */ proc transpose data=ok1 out=ok1(drop=_name_); /*the dataset used for imputation */ proc transpose data=ok2 out=ok2(drop=_name_); /* datasets with 0 variance */ run; %if &ix=1 %then %do; data &OutData; set ok2; run; %end; %else %if &ix=0 %then %do; proc corr cov noprint data=ok1 out = cor_rslt; %sirnorm1(ok1, cor_rslt,&M,&&K); data ok2; set %do kkl=1 %to &M; ok2 %end; ; run; data &OutData; merge ok2 _SNORM_; run; %end; %mend sirnorm;