********************************************************************************
/*

Title: 						ODK Validate

Author: 					Kabira Namit

Date created: 				21-10-2022

Date updated:				30-01-2024

*/
********************************************************************************

/*

Introducing ODK Validate: A survey validation program for KoboToolbox/ SurveyCTO and Stata users

I'm excited to share a Stata program I've developed called ODK Validate. This program should be useful for anyone using ODK/ XLSforms and Stata for data collection and analysis. It's designed to pre-emptively catch and resolve errors that can often go unnoticed during deployment but can cause significant headaches during data analysis.

What It Does:

Identifies duplicate variable names created by select_multiple questions (particularly when you export the data from such questions as separate columns)
Checks that integer questions are properly constrained
Checks for required notes (which prevent responses from being uploaded)
Checks for optional survey questions

Why It's Important:

Imagine you have a select_multiple question titled 'country' with choice list names 1, 2, 3, and 4, and another question in the survey named 'country2'. Your data export will now include two variables called country2 and this may lead to confusion and will cause challenges when you import data into Stata. ODK Validate catches these issues before deployment, saving you time and frustration.

Example Code: 

odk_validate

odk_validate, using(malawi_hfc)
 
*/

/*******************************************************************************

 								Setting defaults

*******************************************************************************/


capture program drop 	odk_validate
program 				odk_validate

version 17

    syntax [, USING(string)]
	
	
if "`using'" == "" {
	
	local using "odk"
}	

quietly {

/*******************************************************************************

 								Section I
								
			Duplicates generated by multiple response questions

*******************************************************************************/

* Import the relevant file

import excel "`using'.xlsx", sheet("survey") firstrow clear

* Keep only the relevant columns and split the type column into two parts (question type and choice list)

keep 			type name
split 			type
drop 			type
rename 			type1 type
rename 			type2 list_name
order 			type name list_name

* Save the unique var names as a distinct dataset

preserve
keep			name
sort 			name
drop if name == ""

* We will be dealing with pre-existing duplicate names in the ODK file later. So, let us save only the unique var names right now. 

bysort name: gen count = _n
keep if count           == 1 
drop count

save "odk_var_names.dta", replace
restore

* Keep only the select multiple questions

keep if type == "select_multiple"

	qui count if !(missing(name))

	if(`r(N)'> 0){
		
drop			type
rename 			name var_name
generate 		id = _n

save "odk_mr_cycle.dta", replace

* Cycle through all multiple response questions one at a time and append the multiple response var names to the master var name file

levelsof id, local(levels) 
 foreach l of local levels {

	use "odk_mr_cycle.dta", clear
	keep if id == `l'
	save "odk_`l'.dta", replace
	
	* Import the choice sheet
 
	import excel "`using'.xlsx", sheet("choices") firstrow clear
	keep list_name name
	merge m:1 list_name using "odk_`l'.dta"
	keep if _merge == 3
	
	 * Concatenate (for example, challenge1 challenge2 challenge3)
	 
	capture tostring name, replace
 
	gen mr_name = var_name + name
	keep mr_name
	rename mr_name name
	save "choices_`l'.dta", replace
	
	* Append each list 
	
	use 			"odk_var_names.dta"
	append using 	"choices_`l'.dta"
	
	save			"odk_var_names.dta", replace
	
	* Erase datasets
	
	erase "odk_`l'.dta" 
	erase "choices_`l'.dta"
	
	
		}

	 
* Check for duplicates

use "odk_var_names.dta"

sort name
quietly by name:  gen dup = cond(_N==1,0,_n)
keep if dup > 0

* Report duplicates

putdocx clear
	putdocx begin
	putdocx paragraph, font(,12) halign(center) style(Title)
	putdocx text ("Duplicates created by multiple response questions"), bold underline
	
	* If duplicates exist
	
	qui count if !(missing(name))

	if(`r(N)'> 0){	
	
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("There are duplicate variable names that are being generated by how you have coded multiple response questions.")
	putdocx paragraph , font(,12) halign(center) style(Heading2)
	putdocx text ("Duplicate names"), bold underline
	
	table	(name),	statistic(freq) nformat(%9.1f percent)
	collect label dim name "Duplicates", modify
	collect style putdocx, layout(autofitwindow)
	putdocx collect 
		
		}
	
	* If duplicates don't exist 
	
	qui count if !(missing(name))
	local N `r(N)'
	if(`r(N)'== 0){	
		
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("There are no duplicates that are being generated.")
		
		}
					
	putdocx save "Duplicates generated by multiple response questions.docx", replace				

		 
* Erase Datasets

erase 		"odk_mr_cycle.dta"


		}
		
erase 		"odk_var_names.dta"

/*******************************************************************************

 								Section II
								
						Duplicate Variable Names

*******************************************************************************/

* Import the relevant file

import excel "`using'.xlsx", sheet("survey") firstrow clear

keep name
drop if name == ""

* Check for duplicates

sort name
quietly by name:  gen dup = cond(_N==1,0,_n)
keep if dup > 0

* Report duplicates

putdocx clear
	putdocx begin
	putdocx paragraph, font(,12) halign(center) style(Title)
	putdocx text ("Duplicate variable names"), bold underline

	* If duplicates exist
	
	qui count if !(missing(name))
	if(`r(N)'> 0){	
	
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("You have used the same variable name for more than one survey question.")
	putdocx paragraph , font(,12) halign(center) style(Heading2)
	putdocx text ("Duplicate names"), bold underline
	
	table	(name),	statistic(freq) nformat(%9.1f percent)
	collect label dim name "Duplicates", modify
	collect style putdocx, layout(autofitwindow)
	putdocx collect 
		
		
		}
	
	* If duplicates don't exist 
	
	qui count if !(missing(name))
	if(`r(N)'== 0){	
		
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("There are no duplicate variable names.")
		
		}
					
putdocx save "Duplicate Variable Names.docx", replace				



/*******************************************************************************

 								Section III
								
							  Compulsory Notes

*******************************************************************************/

* Import the relevant file

import excel "`using'.xlsx", sheet("survey") firstrow clear

* Create label and required variables, if they don't exist in the ODK survey

    capture confirm var label, exact
    if c(rc) == 111 {
		
capture confirm var labelEnglish

if _rc == 0 {
    gen label = labelEnglish
}

else {
    gen label = ""
}				

    }

    capture confirm var required, exact
    if c(rc) == 111 {
    gen required = ""
    }

keep type name label required
keep if type == 	"note"
drop type
capture tostring required, replace
keep if required == "true" | required == "TRUE" | required == "1" | required == "yes" | required == "YES"
drop required

generate 		id = _n
order           id name label

* Report compulsory notes

putdocx clear
	putdocx begin
	putdocx paragraph, font(,12) halign(center) style(Title)
	putdocx text ("Compulsory Notes"), bold underline

	* If compulsory notes exist
	
	qui count if !(missing(name))

	if(`r(N)'> 0){	
	
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("Do you want to receive any submissions? You won't if you have compulsory notes!"), bold underline
	putdocx paragraph , font(,12) halign(center) style(Heading2)
	putdocx text ("Compulsory Notes"), bold underline
	putdocx table tbl_text = data(*), border(start, nil) border(insideV, nil) border(end, nil) 
				
		}
	
	* If compulsory notes don't exist 
	
	qui count if !(missing(name))

	if(`r(N)'== 0){	
		
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("There are no compulsory notes. Phew, bullet dodged.")
		
		}
					
putdocx save "Compulsory Notes.docx", replace			

/*******************************************************************************

 								Section IV
								
							Non-compulsory questions

*******************************************************************************/

* Import the relevant file

import excel "`using'.xlsx", sheet("survey") firstrow clear

* Create label and required variables, if they don't exist in the ODK survey

    capture confirm var label, exact
    if c(rc) == 111 {
		
capture confirm var labelEnglish

if _rc == 0 {
    gen label = labelEnglish
}

else {
    gen label = ""
}				

    }

    capture confirm var required, exact
    if c(rc) == 111 {
    gen required = ""
    }

keep 			type name label required

split 			type
drop 			type
rename 			type1 type
rename 			type2 list_name
drop 			list_name

keep if type == "select_one" | type == "select_multiple" | type == "text" | type == "integer" 
drop type
capture tostring required, replace
keep if required == "false" | required == "FALSE" | required == "" | required == "0" | required == "." | required == "no" | required == "NO"
drop required

generate 		id = _n
order			id name label

* Report non-compulsory questions

putdocx clear
	putdocx begin
	putdocx paragraph, font(,12) halign(center) style(Title)
	putdocx text ("Non-compulsory questions"), bold underline

	* If non-compulsory questions exist
	
	qui count if !(missing(name))

	if(`r(N)'> 0){	
	
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("Enumerators tend to skip or miss questions that aren't required.")
	putdocx paragraph , font(,12) halign(center) style(Heading2)
	putdocx text ("Non-compulsory questions"), bold underline
	putdocx table tbl_text = data(*), border(start, nil) border(insideV, nil) border(end, nil) 
		
		}
	
	* If non-compulsory questions don't exist 
	
	qui count if !(missing(name))

	if(`r(N)'== 0){	
		
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("All important questions are required. Not bad!")
		
		}
					
putdocx save "Non-compulsory questions.docx", replace			

/*******************************************************************************

 								Section V
								
						 Integers without constraints

*******************************************************************************/

* Import the relevant file

import excel "`using'.xlsx", sheet("survey") firstrow clear

* Create label and constraint variables, if they don't exist in the ODK survey

    capture confirm var label, exact
    if c(rc) == 111 {
		
capture confirm var labelEnglish

if _rc == 0 {
    gen label = labelEnglish
}

else {
    gen label = ""
}				

    }

    capture confirm var constraint, exact
    if c(rc) == 111 {
    gen constraint = ""
    }

keep 			type name label constraint

split 			type
drop 			type
rename 			type1 type
rename 			type2 list_name
drop 			list_name

keep if type == "integer" 
drop type
capture tostring constraint, replace
keep if constraint == "" | constraint == "."
drop constraint

generate 		id = _n
order           id name label 


* Report no constraints for integer questions

putdocx clear
	putdocx begin
	putdocx paragraph, font(,12) halign(center) style(Title)
	putdocx text ("No constraints for integer questions"), bold underline

	* If there are integer questions without constraints
	
	qui count if !(missing(name))

	if(`r(N)'> 0){	
	
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("How long do you want to spend on data cleaning?")
	putdocx paragraph , font(,12) halign(center) style(Heading2)
	putdocx text ("No constraints for integer questions"), bold underline
	putdocx table tbl_text = data(*), border(start, nil) border(insideV, nil) border(end, nil) 
		
		}
	
	* If all integer questions have constraints
	
	qui count if !(missing(name))
	local N `r(N)'
	if(`r(N)'== 0){	
		
	putdocx paragraph , font(,12) style(Heading2)
	putdocx text ("All integer questions (if there are any such questions in the survey) have constraints. Good stuff.")
		
		}
					
putdocx save "Integers without constraints.docx", replace	

/*******************************************************************************

							Append

*******************************************************************************/

import excel "`using'.xlsx", sheet("survey") firstrow clear

keep 			type
split 			type

keep if type1 == "select_multiple"

	qui count if !(missing(type1))

	if(`r(N)'> 0){

* Append all reports together

putdocx append 	"Compulsory Notes.docx" /// 
				"Non-compulsory questions.docx" ///
				"Duplicate Variable Names.docx" /// 
				"Duplicates generated by multiple response questions.docx" /// 
				"Integers without constraints.docx", ///
headsrc(first) saving("ODK Validate Report for `using'.docx", replace)


	}
	
	if(`r(N)' == 0){

* Append all reports together

putdocx append 	"Compulsory Notes.docx" /// 
				"Non-compulsory questions.docx" ///
				"Duplicate Variable Names.docx" /// 
				"Integers without constraints.docx", ///
headsrc(first) saving("ODK Validate Report for `using'.docx", replace)


	}

cap erase "Compulsory Notes.docx"
cap erase "Non-compulsory questions.docx"
cap erase "Duplicate Variable Names.docx" 
cap erase "Duplicates generated by multiple response questions.docx"
cap erase "Integers without constraints.docx"

}

clear

display "Your ODK validate report is ready and is available to view in your current directory."

end