/*This SAS macro performs latent class analysis (LCA) for data
  that involve four binary manifest variables
  and a single latent class with two values. 

Feel free to contact me for further information on this project.

Dave Thompson
Assistant Professor
Department of Biostatistics and Epidemiology
University of Oklahoma Health Sciences Center
OUHSC campus address: CHB 321
phone: 405-271-2229, ext. 48054
Fax: 405-271-2068
Email: dave-thompson@ouhsc.edu

Mailing address:
P.O. Box 26901
Oklahoma City, OK, 73190 

/*LOAD MACRO lcacatmod BEFORE calling the macro.  The macro variable indata
refers to a dataset that includes one or more unconditional tables (where 
the variable X is latent and unassigned.*/
%lcacatmod (sim.sampledatarep25)
;

%macro lcacatmod (indata, meth=wls, reps=1000, maxiter=40, intreps=200);
/*establish libraries for input and output*/
options nodate nonumber;
libname sim 'W:\Latent variable research';

/*Send log to external file to avoid filling log window*/
filename logfile 
  "C:\Documents and Settings\dthompso\Desktop\logfile.lst";
Proc printto log=logfile;
run;

/*BEGIN EXAMINATION OF SIMULATED DATASETS
  Execute program for each repetition in data work.sim*/
/*  %do repet=1 %to &reps; */
  /*Alternatively execute program for a particular group of repeated simulations*/
%do repet=25 %to 25;

/*read dataset &indata, select the appopriate simulated rep*/
data cull sum;
  set &indata (where=(rep=&repet)) end=last;
  totaln+freq;		/*obtain total N in input dataset*/
  output cull;
  if last then output sum;  /*report total N in input dataset*/
run;

/*BEGIN INTERNAL REPETITION LOOP.
  Execute program multiple times for each simulated data set*/
%do intrep=1 %to %eval(&intreps); 

/*DATA TWO creates "complete" frequency table by randomly assigning 
  predicted counts (co), within each a*b*c*d response profile, 
	to one of two latent classes x.  

  The DATA TWO step includes lines that ensure that no nonzero
  counts occur in the variable 'co'.  This feature may improve CATMOD's estimation
  of parameters for the latent class model.  I can investigate whether it hurts
  to present CATMOD with zero counts, even when a dataset like STOUFFER includes
  observations for all possible response profiles

  The following macro %DO sequence was inserted on 8-23-2005 
  to avoid iteration problems found empirically to occur 
  with initial estimates of latent class prevalence less than 0.3.
  The loop delays the outputting of WORK.TWO until 
  the initial estimate for p(x=1) is sufficiently high 
  to ensure iteration to a nontrivial solution.*/

%let initpx1=0.1;
%let try=1;
%do %until (%sysevalf(&initpx1)>0.3);  
data two;
  if _n_=1 then set sum (keep=totaln);	/*merge in number of total obs*/
  set cull (drop=totaln);
  m=round(ranuni(0),1);  /*arbitrarily assign latent class to each profile*/
  x=m+1;				 /*assign latent class 1 or 2*/
  co=freq*m + 1e-5;		 /*create variable 'co', avoiding zero counts*/
  expect=.;
  if x=1 then x1obs+freq;	/*accumulate number of obs in LC x=1*/
  pppx1=x1obs/totaln;		/*update estimate of p(x=1)*/
  call symput('initpx1',pppx1); /*update value of macro variable at end of data step*/
run;
%put simulated px1 in data two after try &try  = &initpx1;
%let try=%eval(&try+1);
%end;

/*Begin iterations that estimate latent class model parameters*/
%let iter=1;
/*Discontinue iterations when the maximum number of iterations is reached,
  or when a preset tolerance is reached, e.g. 0.001,
  or when the program fails to iterate toward a solution
  or when the estimate of p(x=1) is suspiciously low after the first iteration*/
%let itdiff=0.5;
%do %until ((&iter=%eval(&maxiter+1))  
			or (&iter ge 3 and 0<%sysevalf(&itdiff) and %sysevalf(&itdiff)<0.001)
			or (&iter=4 and %sysevalf(&itdiff)=0)
			or (&iter=2 and %sysevalf(&it1px1)<0.3) );
%put simulated: Begin iter &iter (rep &repet internal rep &intrep iteration &iter &itdiff);

/*Submit updated frequency table to CATMOD, which, using a model statement that 
	assumes local independence, performs a maximization step.
  Parameter estimates produced by CATMOD are passed to a series of datasteps
  that update estimates of conditional probabilities, latent class prevalences,
  and joint probabilities*/
ods listing close;									/*temporarily turn off listing*/
ods output anova=mlr MaxLikelihood=iters estimates=mu covb=covb;
	/*send LR statistic output to work.mlr and send number of iters to work.iters*/
proc catmod data=two order=data;
  weight co;

	%if &meth=nr %then %do;
	  model a*b*c*d*x = _response_ / ml=nr covb ;
	%end;

	%if &meth=ipf %then %do;
	  model a*b*c*d*x = _response_ / ml=ipf (parm) zero=samp missing=samp covb;
	%end;

	%if &meth=wls %then %do;
		model a*b*c*d*x = _response_ / wls addcell=.5 covb;
	%end;

  loglin a b c d x a*x b*x c*x d*x;
	title "catmod output for iteration #&iter";
run;
quit;
ods listing;						/*reset SAS listing*/

/*obtain vector lambda of updated parameter estimates 
  and vector sigma of parameter standard errors */
data lambdas;
  do k=1 to last;
	  set mu point=k nobs=last;
    array s [9] sigma1-sigma9;
  	array m [9] lambda1-lambda9;
	  m[k]=estimate;
	  s[k]=stderr;
	  if k=9 then output;
	end;
keep lambda1-lambda9 sigma1-sigma9;
stop;
run;

/*	Parameter estimates from CATMOD's log-linear model are read into an array,
    then used to calculate conditional probabilities and latent class prevalences,
	after Heinen, 1996,p. 53, equation 2.17. 
 
		CATMOD's design matrix marks a 0 response with a 1, and a 1 response with a -1.

 		Latent class prevalences are similarly obtained from 
		the log-linear model's estimated parameters for X.

		Joint probabilities (piabx) are the product of conditional probabilities
		and latent class prevalences.*/
data four;
  set lambdas;
	array mu [9] lambda1-lambda9;			/*vector of CATMOD's loglinear param estimates*/
	array vars [4] a b c d;					/*vector of var names*/
	array p [10] pa_x1 pb_x1 pc_x1 pd_x1 	/*vector of conditional and LC probs*/
  						 pa_x2 pb_x2 pc_x2 pd_x2
  						 px1 px2;		
  array pjoint [2] piabcdx1 piabcdx2;		/*array of joint probs*/	
	do a=0 to 1;
	  do b=0 to 1;
		  do c=0 to 1;
			  do d=0 to 1;
					do x=1 to 2;
						do var=1 to 4;
							value=vars[var];

				/*conditional probs for latent classes 1 and 2*/
	      p[4*(x-1)+var] = exp(mu[var]*(-1)**value + mu[var+5]*(-1)**(value+x))
	 										 /(exp(mu[var]*(-1)**value + mu[var+5]*(-1)**(value+x))  
											+ exp(-mu[var]*(-1)**value - mu[var+5]*(-1)**(value+x)));

				/*latent class probabilities*/
				p[8+x]= exp(mu[5]*(-1)**(x-1))
   					/ (exp(mu[5]*(-1)**(x-1)) + exp(-mu[5]*(-1)**(x-1)));

				/*joint probabilities for each class*/
				pjoint[x]=p[4*x-3]*p[4*x-2]*p[4*x-1]*p[4*x]*p[8+x];
		
				/*predicted response probabilities across both classes*/
				piabcd=piabcdx1+piabcdx2;

				/*Allocation probabilities pix1_ab are "posterior" probabilities,
				the prob of being in latent class X=t, given A and B*/
				pix1_abcd=piabcdx1/piabcd;
				pix2_abcd=piabcdx2/piabcd;

				if x=2 and var=4 then output;
								end;
						end;
					end;
			end;
		end;
	end;
drop lambda1-lambda9 value var x;
call symput('it1px1',px1); /*update value of macro variable at end of data step*/
run;

/*grab and keep parameter estimates from this iteration, and append to 
  data work.stopcrit, so that stopping criterion can be determined*/
data paramsnow;
  set four (where=(a=1 and b=1 and c=1 and d=1));	
	iteration=&iter;
	rep=&repet;
	keep iteration rep px1 px2 
		pa_x1 pb_x1 pc_x1 pd_x1 	
  		pa_x2 pb_x2 pc_x2 pd_x2;
run;
proc append data=paramsnow base=stopcrit;
run;

/*Calculate dataset variable 'itdiff,' 
  the average absolute difference 
  in parameter estimates between successive iterations,
  and use the value to update the macro variable itdiff.*/
data itcrit; 
	set stopcrit;
/*vector of parameters*/
	array p [*] pa_x1 pb_x1 pc_x1 pd_x1 	
  				 pa_x2 pb_x2 pc_x2 pd_x2
  				 px1 px2;
/*vector of lagged parameters, for calculating stopping criteria*/
	array pl [*] pa_x1lag pb_x1lag pc_x1lag pd_x1lag 	
  				 pa_x2lag pb_x2lag pc_x2lag pd_x2lag
  				 px1lag px2lag;
/*Obtain absolute difference between each parameter estimate
  and its lagged value.*/
	diffs=0;
	do k=1 to dim(p);
		pl[k]=lag(p[k]);
/*sum, then average, the absolute diffs among p parameters*/
		diffs+abs(p[k]-pl[k]);	
		itdiff=diffs/dim(p);
		if k=dim(p) then output;
	end;
/*update macro variable itdiff from dataset variable*/
call symput('itdiff', itdiff);
run;
%put simulated after rep &repet (intrep &intrep iteration &iter): px1=&it1px1 and itdiff=&itdiff;

/*Combine parameter estimates with observed data,
	then renew predicted cell counts, by x, 
	by multiplying observed counts with allocation probabilities pix_ab*/
proc sort data=four;
  by a b c d;
proc sort data=two;
  by a b c d;
data five;
	if _n_=1 then set sum (keep=totaln);
  merge two four;
	by a b c d;
	iteration=&iter;
	expect=piabcd*totaln;					/*expected frequencies*/
	if x=1 then co=pix1_abcd*freq;			/*predicted frequencies*/
	if x=2 then co=pix2_abcd*freq;
	if co=0 then co=1e-5;
			/*repels cell count predictions from 0 boundary. Not needed when method=wls,
			as addcell option in CATMOD'S model statement covers this problem*/
	keep iteration a b c d x freq co expect /*true*/;
run;

/*Update data work.two*/
data two;
  set five;
run;

/*Maintain a log file for number of internal CATMOD iterations
on this step of the algorithm*/
%if &meth=wls %then %do;
	data ithist;
  	sasiters=1;
	run;
%end;
%else %do;
proc means noprint max data=iters;
  var iteration;
  output out=ithist(drop=_type_ _freq_) max=sasiters;
run;
%end;

/*Calculate and keep a Pearson chi square,
  a statistic that compares predicted and observed 
  "unconditional" cell counts (without respect to LC membership.
  Also calculate and keep a Likelihood ratio chi square*/
data pchisum;
  set two (where=(x=1)) end=last;
  pchi + (((freq-expect)**2)/expect);
  lchi + (2*freq*log((freq+1e-5)/expect));
  keep pchi lchi;
  if last;
run;

/*Calculate and keep a GOF statistic that compares 
"conditional" predicted and TRUE VALUES 
 WHEN THE ALGORITHM IS TESTED ON SIMULATED DATA*/
data gofsum;
  set two end=last;
  if true=0 then true=1;
  gof + (((co-true)**2)/true);
  if last;
run;

/*collect parameter estimates 
(in work.lamdas) and fit statistics (work.gof, work.pchisum, work.mlr,
work.ithist) at each iteration step.*/
data thisiter;
	if _n_=1 then set gofsum (keep=gof);
	if _n_=1 then set pchisum;
	if _n_=1 then set lambdas;
	if _n_=1 then set mlr 
		(where=(source in ("Likelihood Ratio", "Residual")) keep=source chisq df probchisq);
	if _n_=1 then set ithist;
  set four (where=(a=1 and b=1 and c=1 and d=1)); /*displays cond probs for var=1*/
    length method $5.;
	rep=&repet;
	intrep=&intrep;
	iteration=&iter;
	itdiff=&itdiff;
	method=symget('meth');
		
/*DF for goodness of fit statistics, from H&M(2002, Eq.9, p.67*/
	lr_df=(2*2*2*2-1)- (2*(2+2+2+2-(4-1))-1);
	p_lr=1-probchi(pchi,lr_df);
	p_gof=1-probchi(gof,df);
	label pchi='X2'
		lchi='G2'
		chisq='LR from CATMOD'
		df='loc indep df'
		probchisq='loc indep p'
    %if &meth=wls %then %do;
		label chisq="Deviance from CATMOD";
	%end;
	drop a b c d;	
run;

/*For diagnostic purposes, write macro values to log*/
%put simulated rep &repet internal rep &intrep iteration &iter &itdiff;

/*Append output from each iteration step into data work.summary*/
proc append force data=thisiter base=summary;
run;
%let iter=%eval(&iter+1);
%end;		/*END OF ITERATION LOOP*/

/*Delete appended dataset WORK.STOPCRIT after this set of iterations*/
proc datasets;
  delete stopcrit;
run;
quit;

%end;		/*END OF INTERNAL REPETITION LOOP*/
%end;		/*END OF REPETITION LOOP*/

/*Save work.summary as permanent SAS dataset*/
data sim.rep_200solns_on_rep25;
 set summary;
run;
%mend lcacatmod;
run;


/*extract estimates from each iterated process
and define iterations as successful (goorep=1) or unsuccessful (goodrep=0)*/
proc printto;
run;
data checksamp;
  set summary;
  by rep intrep notsorted;
  if last.intrep;
  if (iteration le 3 and itdiff=0) or iteration=40 then goodrep=0;
  else goodrep=1;
  if (iteration le 3 and itdiff=0) then badinit=1;
run;
/*In general, 40 percent of iterated solutions are successful*/
proc freq data=checksamp;
   tables  goodrep badinit;
run;


/*Plot raw and "refleced" estimates of conditional parameters*/

/*LCA iteratively "completes" an unconditional contingency table 
by assigning response profiles to levels of the latent variable.  
The variables' values (X=1,2) have no intrinsic meaning to help 
the algorithm distinguish between them.  The algorithm can associate 
a set of conditional probabilities with the latent class X=1 during one replication, 
and assign the same set of conditional probabilities to the class X=2 
during the next replication. This results in "fractured" distributions 
of parameter estimates, unless the estimates are somehow aggregated in logical groups.
The step below does this by "reflecting" estimates according to 
a logical algorithm whereby observations from each half of the plot 
 where p(var|x1)=p(var|x2) are combined with their counterpart observations.

The direction of the reflection depends on which half
of the plot is more heavily populated.  The more heavily populated 
halfplot should be the target of the reflection.*/
%macro plott (var);
title1 "Conditional probabilities for variable &var";
data csamp3 big;
/*keep only estimates from successful iterations for plotting*/
set checksamp  
	(where=(goodrep=1) 
	keep= goodrep rep intrep iteration
	px1 px2 
    pa_x1 pb_x1 pc_x1 pd_x1 
    pa_x2 pb_x2 pc_x2 pd_x2)
    end=last;
/*Which half of the plot has more data points. 
  That is,on balance, 
  is p(indicator)|x1 greater than p(indicator)|x2???*/
  totaln+1;   /*number of estimates*/
  if p&var._x1 ge p&var._x2 then x1big+1; 
  		/*no. of estims where p(indicator)|x1 greater than p(indicator)|x2*/
  output csamp3;
  if last then output big;
run;

data csamp&var;
  if _n_=1 then set big(keep=x1big totaln);
  set csamp3 (drop=totaln x1big goodrep);
/*if most observations are in upper left of plot, reflect that way*/
  if x1big/totaln ge 0.5 then do;
    if p&var._x1 ge p&var._x2 then do;
      p&var.cx1=p&var._x1;
	  p&var.cx2=p&var._x2;
	end;
	if p&var._x1 lt p&var._x2 then do;
      p&var.cx1=p&var._x2;
	  p&var.cx2=p&var._x1;
    end;
  end;

/*if most observations are in lower right of plot, reflect that way*/
  if x1big/totaln lt 0.5 then do;
    if p&var._x1 ge p&var._x2 then do;
      p&var.cx1=p&var._x2;
	  p&var.cx2=p&var._x1;
	end;
	if p&var._x1 lt p&var._x2 then do;
      p&var.cx1=p&var._x1;
	  p&var.cx2=p&var._x2;
    end;
 end;
keep rep intrep px1 px2 p&var._x1 p&var._x2 p&var.cx1 p&var.cx2;
run;

goptions reset=all htext=1 ftext=swissb ftitle=swissb htitle=1.5;
axis7 order=(0 to 1 by .1) minor=(n=1);
proc gplot data=csamp&var;
  plot p&var._x1*p&var._x2 / vaxis=axis7 haxis=axis7;
  title2 "Raw estimates of P(&var.|X=1) vs P(&var.|X=2)";
run;
proc gplot data=csamp&var;
  plot p&var.cx1*p&var.cx2 / vaxis=axis7 haxis=axis7;
  title2 "Reflected estimates of P(&var.|X=1) vs P(&var.|X=2)";
run;

proc sort data=csamp&var;
 by rep intrep;
run;
%mend plott;

%plott(A)
%plott(B)
%plott(C)
%plott(D)
;

/*merge all estimates*/
data f1;
  merge csampa csampb csampc csampd;
  by rep intrep;
run;
/*summarize estimates by rep and intrep*/
proc means noprint data=f1 mean;
  by rep intrep;
  var px1 px2 pacx1 pbcx1 pccx1 pdcx1 pacx2 pbcx2 pccx2 pdcx2; 
  output out=f2 (drop=_type_ _freq_) mean= ;
run;

/*Plot estimates*/
%macro plott (var);
goptions reset=all htext=1 ftext=swissb ftitle=swissb htitle=1.5;
axis1 label=('N') minor=none;
axis2  label=none value=(f=swissb h=.8);
pattern c=blue;
*proc gchart data=f2;
  *by rep;
  *vbar px1 / midpoints=0 to 1.00 by 0.02 space=0 coutline=black woutline=2
                   noframe raxis=axis1 maxis=axis2;
  *title1 j=c "Estimates of latent class prevalence P(X=1)";
*run;

proc gchart data=f2;
  by rep;
  vbar p&var.cx1 / midpoints=0 to 1.00 by 0.02 space=0 coutline=black woutline=2
                   noframe raxis=axis1 maxis=axis2;
  title1 j=c "Estimates of conditional probability P(&var.|X=1)";
run;
proc gchart data=f2;
  by rep;
  vbar p&var.cx2 / midpoints=0 to 1.00 by 0.02 space=0 coutline=black woutline=2
                  noframe raxis=axis1 maxis=axis2;
  title1 j=c "Estimates of conditional probability P(&var.=1|X=2)";
run;
quit;
%mend plott;

%plott(A)
%plott(B)
%plott(C)
%plott(D)
;