program define reclink
version 8.2
*! v 1.7 14-Jan-2010 M. Blasnik:  Record Linkage
*! 1.7 fixes typo in winkler adjustment to bigram calculation that prevented the adjustment
*! 1.6 fixes file paths with spaces -- encloses all file references in double quotes
*! 1.5 fixes bug introduced in second release and orblock none bug
*
* reclink: fuzzy record linkage between the data in memory (master) and a using file.
*
* For every master observation the program looks for the using observation(s) that
* agree best on the variables in varlist. Observations that agree exactly on every
* variable get a score of 1. The remaining master observations are compared one at a
* time against candidate using records; string variables are compared with a bigram
* similarity (program bigram1) unless listed in exactstr(), everything else by equality.
* The best scoring candidate(s) above minscore() are kept and merged back.
*
* Arguments
*   varlist      matching variables in the master data
*   using        file holding the records to link to
*   idmaster()   variable that uniquely identifies master observations (checked with isid)
*   idusing()    variable that uniquely identifies using observations (checked with isid)
*   gen()        name of the new variable that receives the matching score
*   wmatch()     one weight per matching variable applied to agreement, default all 1
*   wnomatch()   one weight per matching variable applied to disagreement, default = wmatch
*   exclude()    file with idmaster/idusing pairs; those ids are removed from both sides
*                before any matching is attempted
*   uvarlist()   names of the matching variables in the using file, default = varlist
*   minscore()   candidates must score above this value to be kept, default 0.6
*   minbigram()  bigram value above which the extra match credit starts, default 0.6
*   orblock()    candidates must agree exactly on at least one of these variables;
*                none switches this off; defaults to varlist when it has more than 3 variables
*   required()   candidates must agree exactly on all of these variables
*   _merge()     name of the merge indicator created by the final merge, default _merge
*   uprefix()    prefix given to using variables whose names clash with varlist, default U
*   exactstr()   string variables to compare by equality instead of bigram similarity
*   debug()      file name in which to keep the posted fuzzy match results
*
* Errors: exits with code 198 when an option fails validation.
syntax varlist using/ ,  IDMaster(str) IDUsing(str) Gen(str) [WMatch(str) WNOMatch(str) EXClude(str) /* 
*/   UVarlist(str) MINScore(real 0.6) MINBigram(real 0.6) ORBLock(str) REQuired(str) /*
*/ _Merge(str) UPRefix(str) EXActstr(str) DEBug(str) ]

* the merge indicator must be a new variable
if "`_merge'"=="" local _merge "_merge"
cap confirm variable `_merge'
if _rc==0 {
	di as err "`_merge' already exists"
	exit 198
}

* fill in defaults for the weights and the using prefix
local nvars: word count `varlist'
if "`wmatch'"=="" local wmatch: di _dup(`nvars')  " 1 "
if "`wnomatch'"=="" local wnomatch "`wmatch'"
if "`uprefix'"=="" local uprefix "U"
* validate both weight lists; the non-match list was previously skipped
* because of a misspelled macro name
foreach val in `wmatch' `wnomatch'  {
	if `val'<1 {
		di as error " weights must be >=1"
		exit 198
	}
}

local nwms:  word count `wmatch'
local nwnms: word count `wnomatch'
if `nvars'!=`nwnms' | `nvars'!=`nwms'  {
	di as error "wmatch and wnomatch must each have the same # of elements as varlist"
	exit 198
}

if "`uvarlist'"=="" local uvarlist "`varlist'"
local nuvars: word count `uvarlist'
if `nvars'!=`nuvars' {
	di as error "uvarlist must have the same # of elements as varlist"
	exit 198
}

if `:list required in varlist'==0 {
		di as error "all variables in required() option must also be in the main varlist"
		exit 198
}
	
* none is recognised in any letter case, consistent with the check made
* inside the matching loop further down
if lower("`orblock'")!="none" & `:list orblock in varlist'==0 {
		di as error "all variables in orblock() option must also be in the main varlist"
		exit 198
}

* with more than 3 matching variables, block on all of them unless told otherwise
if "`orblock'"=="" & `nvars'>3 local orblock "`varlist'"

* store per-variable settings in numbered macros:
*   v = master name, wm/wnm = weights, uv = using name, string = 1 if string variable,
*   bigram = 1 unless listed in exactstr, inreq = 1 if listed in required
local i=1
foreach var of local varlist {
	local v`i' "`var'"
  local wm`i': word `i' of `wmatch'
	local wnm`i': word `i' of `wnomatch'
	local uv`i': word `i' of `uvarlist'
	cap confirm string var `var'
	local string`i'=(_rc==0)
	local bigram`i': list var in exactstr
	local bigram`i'=1-`bigram`i''
	local inreq`i': list var in required
	local i=`i'+1
}

* scaling factor for bigram score mapping to matchscore 
local biscale=.5/(1-`minbigram')^(1/3)

* everything below works on temporary copies; the original data come back at restore
preserve

quietly {
isid `idmaster'
gen `gen'=0
label var `gen' "reclink matching score"
local mobs=c(N)
tempfile m u matched  idusingmatched idmastermatched 
* fuzzy results go to a temporary file unless debug names a file to keep
if "`debug'"=="" tempfile results
else local results "`debug'"

* slim master file: id, score and matching variables only
keep `idmaster' `gen' `varlist'
sort `idmaster'
local idmtype: type `idmaster'
save "`m'"

if "`exclude'"!="" {
	* remove ids listed in the exclude file from both master and using
	tempfile midexclude uidexclude using2
	tempvar mrgtmp
	use `idmaster' `idusing' using "`exclude'"
	drop if missing(`idmaster') | missing(`idusing')
	save "`midexclude'"
	
	* unique using ids to exclude
	bysort `idusing': keep if _n==1
	keep `idusing'
	sort `idusing'
	save "`uidexclude'"
	
	* unique master ids to exclude; keep only master records not among them
	use "`midexclude'"
	bysort `idmaster': keep if _n==1
	keep `idmaster'
	sort `idmaster'
	merge `idmaster' using "`m'", _merge(`mrgtmp')
	keep if `mrgtmp'==2
	drop `mrgtmp'	
	sort `idmaster'
	local mobs2match=c(N)
	save "`m'", replace

	* keep only using records whose id is not in the exclude list
	use "`using'"
	local uobs=c(N)
	sort `idusing'
	merge `idusing' using "`uidexclude'", _merge(`mrgtmp') nokeep
	keep if `mrgtmp'==1
	drop `mrgtmp'
	save "`using2'"
}
else {
	local using2 "`using'"
}

* prepare the using data: rename its matching variables to the master names
use "`using2'"
local uobs2match=c(N)
if "`uobs'"=="" local uobs=c(N)
isid `idusing'
keep `idusing' `uvarlist'
local i=1
foreach var of varlist `uvarlist' {
	if "`var'"!="`v`i''" {
		cap rename `var' `v`i''
	}
	local i=`i'+1
}
sort `varlist'
local idutype: type `idusing'
* one group number per distinct combination of the matching variables
tempvar uidtmp
egen `uidtmp'=group(`varlist'), missing
save "`u'"

* create lookup table back to original id variable in using dataset
keep `uidtmp' `idusing'
tempfile uids
sort `uidtmp'
save "`uids'"

* now collapse the using dataset down to unique combinations of varlist
use "`u'"
drop `idusing'
bysort `uidtmp' : keep if _n==1
sort `varlist' 
save "`u'", replace
local uniqusingn=c(N)

* identify perfect matches
use "`m'"
sort `varlist'
merge `varlist' using "`u'", nokeep
keep if _merge==3
local perfectn=c(N)
noi di as result _n "`perfectn' perfect matches found" _n

if `perfectn'>0 {
	replace `gen'=1
	keep `idmaster' `uidtmp' `gen'
	sort `idmaster'
	save "`matched'"

	* save using ids for perfect matches
	keep `uidtmp'  
	sort `uidtmp'
	save "`idusingmatched'"

	* save master ids for perfect matches	
	use "`matched'"
	keep `idmaster'
	sort `idmaster'
	save "`idmastermatched'"

	* get rid of perfect matches from using datafile 
	use "`u'"
	sort `uidtmp'
	merge `uidtmp' using  "`idusingmatched'"
	keep if _merge==1
	drop _merge
	save "`u'", replace

	* get rid of perfect matches from master data file
	use "`m'"
	sort `idmaster'
	merge `idmaster' using  "`idmastermatched'"
	keep if _merge==1
	drop _merge
	save "`m'", replace
	
}
	
else {
	use "`m'", replace
}

* fuzzy matching: one pass per remaining master observation, results are posted
* as master id, using group number, score
local tomatch=c(N)
tempname post
postfile `post' `idmtype' `idmaster' long `uidtmp' float `gen' using "`results'"
if `tomatch'>1000 {
noi di as text "Going through `tomatch' observation to assess fuzzy matches, each .=5% complete"
}
forvalues obs=1(1)`tomatch' {
		* load just the current master observation
		use "`m'" in `obs', replace

  * progress dot every 5 percent for large jobs
  if int(20*`obs'/`tomatch')!=int(20*(`obs'-1)/`tomatch') & (`tomatch'>1000) noi di "." _c
	local ifall
	local ifany
	local ifreq

* OR blocking
	* required variables are handled separately below, so take them out of the OR list
	local orblock : list orblock - required
	if lower("`orblock'")!="none" & "`orblock'"!="" {
		local i=1
		foreach v of local orblock {
			local thisval=`v'[1]
			* variables whose value expands to nothing are left out of the OR condition
			if "`thisval'"!="" {
				if `"`ifany'"'!="" {
					local ifany `" `ifany' | "'
				}
				cap confirm string var `v'
				if _rc==0 {
					local ifany `" `ifany' `v'=="`thisval'" "'
				}
				else {
					local ifany `" `ifany' `v'==`thisval' "'
				}
			}
			local i=`i'+1
		}
	local ifall `" if (`ifany') "'
	} /* end of OR blocking */
	
	* now add required matches
	if "`required'"!="" {
		local i=1
		foreach v of local required {
			local thisval=`v'[1]
			if `"`ifreq'"'!="" {
				local ifreq `" `ifreq' & "'
			}
			cap confirm string var `v'
			if _rc==0 {
				local ifreq `" `ifreq' `v'=="`thisval'" "'
			}
			 else {
			 	local ifreq `" `ifreq' `v'==`thisval' "'
			}
			local i=`i'+1
		}
		if `"`ifall'"'!="" local ifall `" `ifall' & (`ifreq') "'
		 else local ifall `" if (`ifreq') "'
	} /* end of required matches */
		
	* remember the master id and the master value of every matching variable
	local thisidmaster=`idmaster'[1]
	local i=1
	foreach v of local varlist {
			local thisv`i'=`v'[1]
			local i=`i'+1
	}
		
* still need to go thru varlist to drop single quotes that cause macro problems?

	* load using data which meets required and orblock 
	use "`u'"  `ifall', replace
	local upossn=c(N)
	if `upossn'>0 {

* scale bigram matches from minscore to 1 into 0->1
		* M accumulates weighted agreement, NM weighted disagreement
		tempvar M NM score bi
		gen `M'=0
		gen `NM'=0
		local i=1
		foreach v of local varlist {
				if "`thisv`i''"!="" {
					if `inreq`i''!=1 {
						* similarity in bi: bigram score for fuzzy strings, 0/1 equality otherwise
						if `string`i''==1 & `bigram`i''==1 {
							bigram1 `v', match(`thisv`i'') gen(`bi') 
						}
						else {
							cap drop `bi'
							if `string`i''==1 gen `bi'=(`v'=="`thisv`i''")
							else gen `bi'=(`v'==`thisv`i'')
						}
					replace `M'=`M'+`wm`i''*(.5*`bi'^2+cond(`bi'>`minbigram',`biscale'*(`bi'-`minbigram')^(1/3),0))
					replace `NM'=`NM'+`wnm`i''*cond(`bi'<(`minbigram'-.2),1,cond(`bi'<`minbigram',1-`bi'^2,(1-`bi')^2)) if !missing(`v') 
					}
					else {
						* required variables already agree exactly: full match weight
						replace `M'=`M'+`wm`i''
					}
			}
				* missing in one, not other = 30% non-match
				 replace `NM'=`NM'+`wnm`i''*0.3 if (missing("`thisv`i''") + missing(`v')) ==1
			local i=`i'+1
		} 
		
		gen `score'=`M'/(`M'+`NM')
		
		* a 0/0 score counts as no match
		replace `score'=0 if `score'==.
		sort `score'
		* now keep the cases with the highest score and post them
		keep if `score'==`score'[_N] & `score'>`minscore'
		local matches=c(N)
		if `matches'>0 {
			local scoreval=`score'[1]
			forval j=1(1)`matches' {
				local thisidusing = `uidtmp'[`j']
				post `post' (`thisidmaster') (`thisidusing') (`scoreval') 
			}
		}
	} 
}
postclose `post'

* now merge the matching back into master 
use "`results'", replace
local resobs=c(N)
if `perfectn'>0 {
	append using "`matched'"
}
* translate the using group numbers back into original using ids
tempvar tmrg
sort `uidtmp'
merge `uidtmp' using "`uids'", nokeep _merge(`tmrg')
assert `tmrg'==3
drop `tmrg' `uidtmp'
sort `idmaster'
save, replace

* clean up using data -- make sure no varnames in common so mismatches can be examined
use "`using'", replace
forvalues i=1(1)`nvars' {
	cap confirm var `v`i'' 
	if _rc==0 rename `v`i'' `uprefix'`v`i'' 
}
tempfile u2
sort `idusing'
save "`u2'"

* now back to original dataset and merge results into that, then using dataset into that
restore
sort `idmaster'
merge `idmaster' using "`results'", _merge(`tmrg')
assert `tmrg'!=2
sort `idusing'
merge `idusing' using "`u2'" , nokeep _merge(`_merge')
local fobs=c(N)
qui count if missing(`idusing')
local nomatch=r(N)
* place each prefixed using variable next to its master counterpart
forvalues i=1(1)`nvars' {
	cap move  `uprefix'`v`i'' `v`i''  
	cap move  `v`i'' `uprefix'`v`i''   
}
} /* end of quietly block */

* summary report
format `gen' %5.4f
noi di _n "Added: `idusing'= identifier from `using'   `gen' = matching score"
noi di as res "Observations:  Master N = `mobs'    `using' N= `uobs' "
if "`exclude'"!="" {
	noi di as res "  # Obs after excluding matches in `exclude':  Master = `mobs2match'    `using': `uobs2match' "
}
noi di as res "  Unique Master Cases: matched = " `mobs'-`nomatch'   " (exact = `perfectn'), unmatched = `nomatch'"
end


program define bigram1
version 8.2
*! version 2.2 M Blasnik 19-Sep-2005
* bigram1: similarity between a string variable and one fixed comparison string.
*   varlist  the single string variable to score
*   gen()    variable that receives the score; it is created if absent and reset to 0 if present
*   match()  the comparison string
* The score is the share of two-character pairs of match() found in the variable,
* with a separate rule for strings shorter than 3 characters and an upward
* adjustment for agreement on the leading characters. Exact agreement scores 1
* and an empty string in the variable scores 0.
syntax varlist (max=1) , gen(str) match(str) 
tempvar vlen lead isshort contained

* start the score at zero, reusing the output variable when it already exists
capture confirm var `gen'
if _rc==0 {
	replace `gen'=0
}
else {
	gen `gen'=0
}

local targetlen=length("`match'")
gen `vlen'=length(`varlist')
local npairs=`targetlen'-1

* count the pairs of the comparison string that occur anywhere in the variable
if `targetlen'>2 {
	forvalues k=1/`npairs' {
		quietly replace `gen'=`gen'+1 if index(`varlist',substr("`match'",`k',2))>0
	}
	quietly replace `gen'=`gen'*2/(`vlen'+`targetlen'-2)
}

* deal with strings <3 characters: score depends on whether one string contains the other
quietly gen byte `isshort'=(`targetlen'<3 | `vlen'<3)
quietly gen byte `contained'=( index("`match'",`varlist') | index(`varlist',"`match'") )
quietly replace `gen'=0 if `isshort' & !`contained'
quietly replace `gen'=min(`targetlen',`vlen')/(`targetlen'+`vlen'-1)  if `isshort' & `contained'

* Winkler adjustment: adjusts score upward based on first 1,2,3, or 4 characters matching
gen byte `lead'=0
forvalues k=1/4 {
	quietly replace `lead'=`lead'+1 if substr(`varlist',1,`k')==substr("`match'",1,`k')
}
quietly replace `gen'=`gen'+`lead'*(1-`gen')/10

* make sure exact matches =1 and missing =0 
quietly replace `gen'=1 if `varlist'=="`match'"
quietly replace `gen'=0 if `vlen'==0
end