/*******************************************************************************


excelclean automatically loads all excel files in a specified directory, 
organizes variable names and labels, reshapes the dataset if necessary, 
and integrates all files into a cleaned dataset

Author: Lu Han 
Last update: 24 Jul 2018

*******************************************************************************/


capture program drop excelclean 
program define excelclean 

	syntax , Datadir(string) sheet(string) cellrange(string) ///
	         [RESultdir(string) EXtension(string) namerange(integer 1) namelines(string) ///
			 Wordfilter(string) Droplist(string) pivot integrate]

			 
	version 13.1
	
	cd "`datadir'"
	if "`extension'" == "" {
		local extension xlsx
	}
	if "`resultdir'" == "" {
		local resultdir `c(pwd)'
	}
	if "`namelines'" == ""  {
		local namelines "1"
	}
	
	
	local allfiles : dir "." files "*.`extension'"  
		
	tempfile building
	clear    
	qui save `building', emptyok
	foreach f of local allfiles {
		di "Analyzing `f'"
		qui {
		
		import excel using "`f'", clear sheet(`sheet') cellrange(`cellrange') 
		
			local emptyid = 1
			foreach var of varlist _all {
				
				local label = ""
				foreach name of local namelines {
					local label = "`label'" + `var'[`name']	
				}
								
				if "`label'" == "" {
					local label "id`emptyid'_"
					local ++emptyid
				}
				
				local n_chars = length("`label'")					
				forv i = 1/`n_chars' { 
					 local char = substr("`label'", `i', 1) 
					 if inrange("`char'", "a", "z") | inrange("`char'", "A", "Z") | inrange("`char'", "0", "9") | "`char'" == " "   {
					 }
					 else {
						local label = subinstr("`label'","`char'"," ",1)				 
					 }
				}
				
				// debug 1
				di "processing `label'"
				foreach str of local wordfilter {
					local label = subinstr("`label'","`str' ","",.)
				}
				local label = stritrim("`label'")			
				// debug 2
				di "processing 2 `label'"
				
				local n_words : word count `label'           //word must be separated by a space
				local last_word `: word `n_words' of `label''
				local newname = subinstr("`label'","`last_word'","",.)  // delete the last word (may contain year) from the label 
				if regexm("`last_word'","[0-9]$") == 0 {
					local reshape_opt "string"
				}
				// debug 3
				di "new name `newname'"
				// get the first letter of each word and formulate variable name 
				local n_words : word count `newname'
				local newname2 = ""
				forv i = 1(1)`n_words' {
					local name `:word `i' of `newname''
					local name = upper(substr("`name'",1,1))    // (1,1) take the first letter 
					local newname2 = "`newname2'" + "`name'"
				}
				
				local newname2 = "`newname2'"  + "_`last_word'"			

				di "`newname2'"
				rename `var' `newname2'
				label var `newname2' "`newname'"
			}
			
			drop if _n <= `namerange'

			foreach var of varlist _all {
				if strpos("`droplist'","`var' ") != 0  {
					capture drop `var'
				}
			}
			
			if "`pivot'" != "" {
				//get the list of variables to be reshaped 
				local reshapeVarList = ""
				local reshapeLabelList = ""
				local idVarList = ""
				
				tostring *, replace // double check all variables are recorded in string format
				foreach var of varlist _all {
					replace `var' = "." if `var' == "n.a."
					
					if regexm("`var'","[0-9]+[a-zA-Z]?$") {
						local name = regexr("`var'","[0-9]+[a-zA-Z]?$","")   //if variable name contains numbers, delete numbers
						if strpos("`reshapeVarList'"," `name' ") == 0  {
							local label_`name' : variable label `var'
							local reshapeVarList "`reshapeVarList' `name' "
						}
					}
					else {
						if regexm("`var'","id[0-9]+_$") == 0 {
							local idVarList "`idVarList' `var'"
						}
					}
					
				}
			
				noi di "ID Vars: `idVarList'"
				noi di "Reshape Vars: `reshapeVarList'"
				reshape long `reshapeVarList', i(`idVarList') j(time) `reshape_opt'
				compress 
				
				// Recorver Labels 
				foreach var of local reshapeVarList {
					label var `var' "`label_`var''"
				}
			}
			
		foreach var of varlist _all {
			local refinedname = subinstr("`var'","_","",.)
			rename `var' `refinedname'
		}		
		
		local file_name = subinstr("`f'",".`extension'",".dta",.)
		if "`integrate'" == "" {
			destring *, replace 				
		}		
		save "`resultdir'//`file_name'", replace
		
		if "`integrate'" != "" {
			append using `building'
			save `building', replace
		}
		}
	}
	
	if "`integrate'" != "" {
		qui destring *, replace 				
		qui save "`resultdir'//clean.dta", replace 
	}
	
	qui cd ..

end