// dta2md.ado
// Generate Meta Data from Stata system file
// Klaus Pforr
// 2019-10-10
// version 1.0.17
// CC BY-NC-SA 4.0
//
// Change log
// v2: 2019-02-28 no mata
// v3: 2019-02-28 streamlining of program structure draft
// v4: 2019-03-01 bugfixing in input control, solved abbreviation of group var
// v5: 2019-03-01 program structure draft clear
// v6: 2019-03-04 specification of output variable formats necessary for postfile, 
//                length of longest value label, first implementation steps of 
//                program draft
// v7: 2019-03-05 finishing implementation for "if v in freqvarlist"
// v8: 2019-03-05 restructuring program draft -> integrate distinction between
//                "if v in freqvarlist", "if v not in freqvarlist and nocount option specified",
//                "if v not in freqvarlist and nocount option not specified" in loop over variables and all levels
// v9: 2019-05-09 debugging
//                drop nocount option
// v10: 2019-05-13 debugging
//                 drop abbreviation of group var
// v11: 2019-05-14 revert to v10
//                 add mother
//                 debug input check problems
// v12: 2019-05-16 debug problems with relation
//                 debug firstVar: rename to first, and 1 for first level within groups
//                 debug: only list cases for which we have values
//                 debug group var: type at least str3
// v13: 2019-07-02 still problem with relation
//                    check relationship problems only if relation specified
//                 check: only list cases for which we have values?
//                 check: group var: type at least str3
//                 add progress report
// v14: 2019-07-08 debug unsolved problems with input control
//                 simplify code / remove redundancies between parts for freqvarlist vars and for non-freqvarlist vars
// v15: 2019-07-08 fix bug: first for non-freqvarlist vars is now =1 for all groups of the group var
// v16: 2019-08-13 bug with empty missing option
// v17: 2019-10-10 add check for continuous variables in freqvarlist, change version to 10.0
//
// Draft of program:
// loop over vars v
//   if v in freqvarlist
//     loop over all levels l of var v
//       post varName, variableLabel, total_n, total_missing, min, max, Mean, StandardDeviation for each level l for group "all",
//            value, valueLabel, n, percent, validPercent, isValid for each level l for group "all"
//       if first level l of var v 
//         post first in first level l for group "all"
//       if group var specified
//         loop over levels of group var g
//           post varName, variableLabel, total_n, total_missing, min, max, Mean, StandardDeviation for each level l for group g
//           post value, valueLabel, n, percent, validPercent, isValid for each level l for group g
//   if v not in freqvarlist
//     post varName, variableLabel, total_n, total_missing, min, max, Mean, StandardDeviation once for group "all"
//     if group var specified
//       loop over levels of group var g
//         post varName, variableLabel, total_n, total_missing, min, max, Mean, StandardDeviation once for group g

// dta2md: write a metadata dataset describing every variable of a Stata file.
//
// Options
//   output(string)       file the metadata is posted to (via postfile)
//   input(string)        Stata dataset to describe
//   freqvarlist(string)  variables for which one row per observed level is posted
//   group(string)        single grouping variable; all rows are repeated per group level
//   replace              allow overwriting the output file
//   missingdef(string)   expression with placeholder X (replaced by each numeric
//                        variable name) that must evaluate to 0/1; 1 marks a missing
//   smissingdef(string)  same as missingdef, applied to string variables
//   relation(string)     regular expression; stripping its match from a variable
//                        name yields the name stored in the added variable "mother"
//
// Output variables: group computed varName variableLabel total_n total_missing
//   min max Mean StandardDeviation value valueLabel n percent validPercent
//   isValid first (plus mother if relation() is given).
//
// The data in memory are preserved on entry and restored on exit, including
// on every error exit. Validation failures exit with a non-zero return code.
//
// NOTE(review): the program declares version 10.0 but calls fvexpand and
// ustrupper(), which to my knowledge were introduced in later Stata releases
// - confirm the minimum supported release.
program define dta2md, nclass
  version 10.0
  syntax, OUTput(string) INput(string) [FREQVARlist(string) GRoup(string) REPLace MISSingdef(string) SMISSingdef(string) RELATion(string)]

  // preserve current status (restored automatically on any exit)
  quietly preserve

  // ------------------------------------------------------------------
  // Input control (input() and output() are required by -syntax-)
  // ------------------------------------------------------------------
  capture confirm file `"`input'"'
  if _rc!=0 {
    noisily di as error `"input file `input' not found"'
    exit 601
  }
  quietly use `"`input'"', clear

  // List of variables the main loop runs over
  quietly ds
  local varlist `r(varlist)'
  local nvars : list sizeof varlist

  // Check freqvarlist option
  if `"`freqvarlist'"'!="" {
    fvexpand `freqvarlist'
    local freqvarlistexp `r(varlist)'
    capture confirm variable `freqvarlistexp'
    if _rc!=0 {
      noisily di as error "One variable in freqvarlist cannot be interpreted as variable"
      exit 111
    }
    // levelsof fails for variables with too many distinct values;
    // such (continuous) variables cannot be tabulated level by level
    local contvars=0
    foreach var of local freqvarlistexp {
      capture levelsof `var'
      if _rc!=0 {
        local contvars=`contvars'+1
        noisily di as error "`var' has too many levels."
      }
    }
    if `contvars'>0 {
      noisily di as error "Drop",cond(`contvars'==1,"this variable","these variables"),"from the freqvarlist."
      exit 198
    }
  }

  // Check group option
  if `"`group'"'!="" {
    fvexpand `group'
    local groupexp `r(varlist)'
    if `:list sizeof groupexp'>1 {
      noisily di as error `"Group variable `groupexp' specifies more than one variable"'
      exit 103
    }
    capture confirm variable `groupexp'
    if _rc!=0 {
      noisily di as error `"Group variable `groupexp' cannot be interpreted as variable"'
      exit 111
    }
  }

  // Check relationship option: after stripping the pattern once from a
  // variable name, the remainder must not match the pattern again.
  // Done on the names themselves, so it does not depend on the number
  // of observations in the dataset.
  if `"`relation'"'!="" {
    local relationproblem=0
    foreach var of local varlist {
      if regexm("`var'",`"`relation'"') {
        local mothername=regexr("`var'",`"`relation'"',"")
        if regexm("`mothername'",`"`relation'"') {
          di as error "Multiple levels of child-mother relationship in variable `var' detected (child-mother-grandmother)"
          local relationproblem=1
        }
      }
    }
    if `relationproblem'==1 {
      di as error "Current version supports only one level of child-mother relationship"
      exit 198
    }
  }

  // Check missing definitions: must mention X and evaluate to 0/1
  // for every observation of every variable of the matching type
  if `"`missingdef'"'!="" {
    if regexm(`"`missingdef'"',"X")==0 {
      noisily di as error "Missing definition does not relate to any variable"
      exit 198
    }
    quietly ds, has(type numeric)
    foreach var in `r(varlist)' {
      local chk=subinstr(`"`missingdef'"',"X","`var'",.)
      capture quietly count if (`chk')==0|(`chk')==1
      if _rc!=0 | r(N)!=_N {
        noisily di as error "Error in missing definition for variable `var'"
        exit 198
      }
    }
  }
  if `"`smissingdef'"'!="" {
    if regexm(`"`smissingdef'"',"X")==0 {
      noisily di as error "String missing definition does not relate to any variable"
      exit 198
    }
    quietly ds, has(type string)
    foreach var in `r(varlist)' {
      local chk=subinstr(`"`smissingdef'"',"X","`var'",.)
      capture quietly count if (`chk')==0|(`chk')==1
      if _rc!=0 | r(N)!=_N {
        noisily di as error "Error in string missing definition for variable `var'"
        exit 198
      }
    }
  }

  // ------------------------------------------------------------------
  // Check output file
  // Without replace the file must not exist yet. The check must not
  // create the file itself, otherwise postfile below would refuse it.
  // With replace, postfile reports any problem opening the file.
  // ------------------------------------------------------------------
  if `"`replace'"'=="" {
    capture confirm new file `"`output'"'
    if _rc!=0 {
      local outrc=_rc
      if `outrc'==602 {
        noisily di as error "Output file exists"
      }
      else {
        noisily di as error `"Output file `output' cannot be opened for writing"'
      }
      exit `outrc'
    }
  }

  // ------------------------------------------------------------------
  // Process group variable: groupkey is the uppercase string variable
  // that identifies the groups in all later comparisons
  // ------------------------------------------------------------------
  local groupexptype str3
  if "`groupexp'"!="" {
    if substr("`:type `groupexp''",1,3)!="str" {
      quietly tempvar groupexpstr
      if (`"`:value label `groupexp''"'!="") {
        // numeric with value labels: use the labels
        quietly decode `groupexp', gen(`groupexpstr')
      }
      else {
        // numeric without value labels: use the formatted values
        quietly gen `groupexpstr'=string(`groupexp',"`:format `groupexp''")
      }
      local groupkey `groupexpstr'
    }
    else {
      local groupkey `groupexp'
    }
    quietly replace `groupkey'=ustrupper(`groupkey')

    // Width of the posted group column: taken from the storage type of
    // the key variable (not from its name), at least 3 to hold "all"
    local grouptypename : type `groupkey'
    if "`grouptypename'"=="strL" {
      tempvar grouplen
      quietly gen long `grouplen'=strlen(`groupkey')
      quietly summarize `grouplen', meanonly
      local groupwidth=min(max(3,r(max)),c(maxstrvarlen))
      drop `grouplen'
    }
    else {
      local groupwidth=max(3,real(substr("`grouptypename'",4,.)))
    }
    local groupexptype str`groupwidth'

    // Group levels do not change during the run: compute them once
    quietly levelsof `groupkey', local(groupexplevels)
  }

  // ------------------------------------------------------------------
  // Open output file
  // ------------------------------------------------------------------
  // Maximum length of value labels for the type of valueLabel;
  // at least 1 because str0 is not a valid storage type
  local valuelabellength=1
  foreach var of local varlist {
    if (`"`:value label `var''"'!="") {
      quietly levelsof `var'
      foreach level in `r(levels)' {
        local valuelabellength=max(`valuelabellength',strlen(`"`:label (`var') `level',strict'"'))
      }
    }
  }
  local valuelabellength=min(`valuelabellength',c(maxstrvarlen))

  // Actual postfile
  tempname out
  quietly postfile `out' `groupexptype' group ///
    byte computed str32 varName str80 variableLabel ///
    float (total_n total_missing min max Mean StandardDeviation) ///
    str32 value str`valuelabellength' valueLabel ///
    float (n percent validPercent isValid first) ///
    using `"`output'"', `replace'

  // Progress report: header first
  noisily _dots 0, title(Variables processed) reps(`nvars')

  // ------------------------------------------------------------------
  // Loop over variables
  // ------------------------------------------------------------------
  local v=1
  foreach var of local varlist {
    local isstr=(substr("`:type `var''",1,3)=="str")
    local infreq : list var in freqvarlistexp

    // Missing expression that applies to this variable (X replaced by
    // the variable name); constant 0 if the matching option is not used
    local missexp=0
    if `isstr' & `"`smissingdef'"'!="" {
      local missexp=subinstr(`"`smissingdef'"',"X","`var'",.)
    }
    if !`isstr' & `"`missingdef'"'!="" {
      local missexp=subinstr(`"`missingdef'"',"X","`var'",.)
    }

    // ---------------- group "all" ----------------
    // Variable level info: total_n total_missing min max Mean StandardDeviation
    local total_n=_N
    quietly count if (`missexp')==1
    local total_missing=r(N)
    // For string variables no summarize is run; r(min) etc. are then
    // undefined after count and the locals become missing (.)
    if !`isstr' {
      quietly sum `var' if (`missexp')==0
    }
    local min=r(min)
    local max=r(max)
    local Mean=r(mean)
    local StandardDeviation=r(sd)
    // first flags the first row posted for this variable within the group
    local first=1

    if `infreq' {
      // one row per level of var
      quietly levelsof `var', local(varlevels)
      foreach vlevel of local varlevels {
        // value, valueLabel and the condition selecting this level
        if `isstr' {
          local value=`"`vlevel'"'
          local valueLabel=""
          local match `"`var'==`"`vlevel'"'"'
        }
        else {
          local value=`"`=string(`vlevel',"`:format `var''")'"'
          local valueLabel=`"`:label (`var') `vlevel',strict'"'
          local match `"`var'==`vlevel'"'
        }
        // n percent validPercent
        quietly count if `match'
        local n=r(N)
        local percent=`n'/`total_n'*100
        local validPercent=`n'/(`total_n'-`total_missing')*100
        // isValid: 1 if all cases of the level are valid, 0 if all are
        // missing, system missing otherwise
        local isValid=.
        quietly count if `match' & (`missexp')==0
        if r(N)==`n' {
          local isValid=1
        }
        quietly count if `match' & (`missexp')==1
        if r(N)==`n' {
          local isValid=0
        }
        // post only if number of cases for this value >0
        if `n'>0 & `n'!=. {
          quietly post `out' ("all") (`infreq') (`"`var'"') (`"`:variable label `var''"') ///
            (`total_n') (`total_missing') (`min') (`max') (`Mean') (`StandardDeviation') ///
            (`"`value'"') (`"`valueLabel'"') ///
            (`n') (`percent') (`validPercent') (`isValid') (`first')
          local first=0
        }
      }
    }
    else {
      // not in freqvarlist: one row, value level info set to missing
      local value=""
      local valueLabel=""
      local n=.
      local percent=.
      local validPercent=.
      local isValid=.
      quietly post `out' ("all") (`infreq') (`"`var'"') (`"`:variable label `var''"') ///
        (`total_n') (`total_missing') (`min') (`max') (`Mean') (`StandardDeviation') ///
        (`"`value'"') (`"`valueLabel'"') ///
        (`n') (`percent') (`validPercent') (`isValid') (`first')
      local first=0
    }

    // ---------------- each level of the group variable ----------------
    if `"`groupexp'"'!="" {
      foreach glevel of local groupexplevels {
        // condition selecting the observations of this group
        local ingroup `"`groupkey'==`"`glevel'"'"'

        // Variable level info within the group
        quietly count if `ingroup'
        local total_n=r(N)
        quietly count if (`missexp')==1 & `ingroup'
        local total_missing=r(N)
        if !`isstr' {
          quietly sum `var' if (`missexp')==0 & `ingroup'
        }
        local min=r(min)
        local max=r(max)
        local Mean=r(mean)
        local StandardDeviation=r(sd)
        local first=1

        if `infreq' {
          quietly levelsof `var', local(varlevels)
          foreach vlevel of local varlevels {
            if `isstr' {
              local value=`"`vlevel'"'
              local valueLabel=""
              local match `"`var'==`"`vlevel'"'"'
            }
            else {
              local value=`"`=string(`vlevel',"`:format `var''")'"'
              local valueLabel=`"`:label (`var') `vlevel',strict'"'
              local match `"`var'==`vlevel'"'
            }
            // n percent validPercent within the group
            quietly count if `match' & `ingroup'
            local n=r(N)
            local percent=`n'/`total_n'*100
            local validPercent=`n'/(`total_n'-`total_missing')*100

            // isValid
            local isValid=.
            // Special case: valid case for all groups combined but no
            // cases for this group -> isValid=1
            quietly count if `match' & (`missexp')==0
            local one_valid_across_all_groups=(r(N)>0 & r(N)!=.)
            // valid cases of this level within the group
            quietly count if `match' & (`missexp')==0 & `ingroup'
            if (`one_valid_across_all_groups'==1 & r(N)==0) | (`one_valid_across_all_groups'==1 & r(N)>0 & r(N)==`n') {
              local isValid=1
            }
            // missing cases of this level within the group
            quietly count if `match' & (`missexp')==1 & `ingroup'
            if `one_valid_across_all_groups'==0 | (`one_valid_across_all_groups'==1 & r(N)>0 & r(N)==`n') {
              local isValid=0
            }

            // post only if number of cases for this value >0
            if `n'>0 & `n'!=. {
              quietly post `out' (`"`glevel'"') (`infreq') (`"`var'"') (`"`:variable label `var''"') ///
                (`total_n') (`total_missing') (`min') (`max') (`Mean') (`StandardDeviation') ///
                (`"`value'"') (`"`valueLabel'"') ///
                (`n') (`percent') (`validPercent') (`isValid') (`first')
              local first=0
            }
          }
        }
        else {
          local value=""
          local valueLabel=""
          local n=.
          local percent=.
          local validPercent=.
          local isValid=.
          // compound quotes so that labels containing " do not break the post
          quietly post `out' (`"`glevel'"') (`infreq') (`"`var'"') (`"`:variable label `var''"') ///
            (`total_n') (`total_missing') (`min') (`max') (`Mean') (`StandardDeviation') ///
            (`"`value'"') (`"`valueLabel'"') ///
            (`n') (`percent') (`validPercent') (`isValid') (`first')
          local first=0
        }
      }
    }

    // End of one variable step -> Progress report -> Print dot
    noisily _dots `v' 0
    local v=`v'+1
  }

  // Close output file
  quietly postclose `out'

  // Add mother variable: varName with the relation pattern stripped,
  // empty where the pattern does not match
  if `"`relation'"'!="" {
    quietly use `"`output'"', clear
    quietly gen mother=cond(regexm(varName,`"`relation'"')==0,"",regexr(varName,`"`relation'"',""))
    quietly order group computed varName mother
    quietly save `"`output'"', replace
  }

  // Restore previous data
  quietly restore
end