#delim ;
prog def dsweight, sortpreserve;
version 10.0;
/*
 Input a varlist of standardization variables
 and generate a variable containing direct standardization weights,
 for input as pweights to other Stata commands,
 to produce results standardized
 by combinations of the input variables
 to the joint distribution of these variables in a target population,
 which is either the total sample,
 or a by-group,
 or an outside standard population,
 specified in a using dataset with 1 observation per combination
 of the variables in the input varlist,
 and data on frequencies of that combination in the target population.
 The generated weight for an observation is the ratio
 of the target-population sum of weights
 for its combination of the input varlist (within its by-group)
 to the sample sum of weights
 for its combination of the input varlist and the groupvars() varlist
 (within its by-group).
 The program exits with error 498
 if the tfreqvar() variable already exists in the dataset in memory,
 or if missing weights are generated and missing is not specified,
 or if incomplete groups are found and nocomplete is not specified.
*!Author: Roger Newson
*!Date: 18 January 2012
*/

syntax varlist [if] [in] [using/] [fweight pweight iweight aweight] , Generate(name)
  [ GRoupvars(varlist) BY(varlist) noCOmplete TFReqvar(name) sorted Missing float fast
  ];
/*
 generate() specifies the name of the new variable to be generated,
   containing direct standardization weights.
 groupvars() specifies a varlist,
   whose combinations specify groups
   within which the generated weights will standardize the distribution
   to the target population.
 by() specifies a varlist,
   whose combinations specify by-groups,
   each of which has its own target population
   to which the subsets specified by the groupvars() varlist are to be standardized,
   either in the total sample in the by-group
   or in an outside target population specified by corresponding by-groups
   in the using dataset.
 nocomplete specifies that all combinations of the input varlist
   do not have to be represented in each combination of the groupvars() varlist,
   either in the whole input sample
   or in the by-group if by() is specified.
 tfreqvar() specifies the name of a numeric variable in the using dataset,
   containing (or proportional to) the frequency, in each observation,
   of the corresponding combination of the input varlist in the target population.
   If a using dataset is specified and tfreqvar() is absent,
   then the name _freq is assumed.
 sorted specifies that the using dataset is already sorted
   primarily by the by-variables (if by() is specified)
   and secondarily by the varlist,
   so dsweight does not have to do so.
   It is passed unchanged to the merge command.
 missing specifies that missing direct standardization weights may be generated.
 float specifies that the generated variable specified by generate()
   will have type float or lower.
 fast specifies that dsweight will not do any work
   to restore the original input dataset
   if the user presses Break.
*/

*
 Set target frequency variable name if required
 and check that it does not already exist
 in the dataset in memory,
 because it will be merged in from the using dataset
 and dropped again afterwards
*;
if `"`using'"'!="" {;
  if "`tfreqvar'"=="" {;local tfreqvar="_freq";};
  cap conf var `tfreqvar';
  if !_rc {;
    disp as error "Variable `tfreqvar' already exists";
    error 498;
  };
};

*
 Preserve the dataset in memory unless fast is specified,
 so that it is restored automatically
 if the program fails or the user presses Break
*;
if "`fast'"=="" {;preserve;};

*
 Mark the estimation sample,
 allowing string variables in the varlist and in the groupvars() varlist
*;
marksample touse, strok;
markout `touse' `groupvars', strok;

*
 Generate input weights
 (equal to 1 in the sample if no weights are specified)
*;
tempvar inpwei;
if `"`exp'"'=="" {;
  qui gene byte `inpwei'=1 if `touse';
};
else {;
  qui gene double `inpwei' `exp' if `touse';
  qui compress `inpwei';
};

*
 Calculate sample and target population sums of weights.
 The sample sum is created as a double, like the target sum,
 because the default egen storage type is float,
 which would round the denominator of the standardization weights.
 The sort order puts the varlist before the groupvars() varlist,
 so that the later by-prefix on the by-variables and varlist alone is legal
*;
tempvar sampsw tarpsw;
qui bysort `touse' `by' `varlist' `groupvars': egen double `sampsw'=total(`inpwei') if `touse';
if `"`using'"'=="" {;
  * Target population is total sample (or total by-group) *;
  qui by `touse' `by' `varlist': egen double `tarpsw'=total(`inpwei') if `touse';
};
else {;
  * Target population is specified in using dataset *;
  qui {;
    sort `by' `varlist';
    merge m:1 `by' `varlist' using `"`using'"', `sorted' noreport nolabel nonotes nogenerate keep(master match) keepus(`tfreqvar');
    gene double `tarpsw'=`tfreqvar' if `touse';
    drop `tfreqvar';
  };
};

*
 Generate the standardization weights
 as the ratio of target to sample sums of weights,
 and reduce storage types where this loses no information
*;
qui {;
  compress `sampsw' `tarpsw';
  gene double `generate'=`tarpsw'/`sampsw' if `touse';
  if "`float'"!="" {;recast float `generate', force;};
  compress `generate';
};

*
 Check for missing standardization weights if required
*;
if "`missing'"=="" {;
  tempname nmiss;
  qui count if `touse' & missing(`generate');
  scal `nmiss'=r(N);
  if `nmiss'>0 {;
    disp as error "Missing standardization weights for " `nmiss' " observations in the sample"
      _n as error "Use option missing to allow missing standardization weights";
    error 498;
  };
};

*
 Check for completeness if required.
 A group is complete if the number of distinct varlist combinations
 found in that group
 is equal to the number found in the whole sample (or by-group)
*;
if "`complete'"!="nocomplete" {;
  tempvar tag1 sumtag1 tag2 sumtag2;
  tempname nincomp;
  qui {;
    * Number of distinct varlist combinations in the sample or by-group *;
    bysort `touse' `by' `varlist': gene byte `tag1'=_n==1 if `touse';
    by `touse' `by': egen double `sumtag1'=total(`tag1') if `touse';
    * Number of distinct varlist combinations in each group *;
    bysort `touse' `by' `groupvars' `varlist': gene byte `tag2'=_n==1 if `touse';
    by `touse' `by' `groupvars': egen double `sumtag2'=total(`tag2') if `touse';
    compress `sumtag1' `sumtag2';
    count if `touse' & (`sumtag1'!=`sumtag2');
    scal `nincomp'=r(N);
  };
  if `nincomp'>0 {;
    disp as error `nincomp' " observations in the sample are in groups"
      _n as error "defined by variables:"
      _n as error "`groupvars'"
      _n as error "with incomplete range of value combinations for variables:"
      _n as error "`varlist'"
      _n as error "Use option nocomplete to allow incomplete groups";
    error 498;
  };
};

*
 Cancel the pending restore on successful completion,
 so that the generated variable is kept
*;
if "`fast'"=="" {;restore, not;};

end;