*! opencagegeo version 1.2.0 (22/02/2018)
*! Lars Zeigermannn
* opencagegeo uses the OpenCage Geocoder API. It reuses some code segments of
* geocode written by Adam Ozimek and Daniel Miles. opencagegeo requires
* insheetjson and libjson written by Erik Lindsley.
* Version 1.1.0: a bug affecting users of paid keys has been fixed and
*                the paidkey option was added
* Version 1.2.0: opencagegeo no longer requires the paidkey option
*
* Overview
*   Sends one request per observation in the estimation sample to the
*   OpenCage JSON endpoint and stores selected fields of the first result in
*   the string variables g_lat g_lon g_country g_state g_county g_city
*   g_postcode g_street g_number g_confidence g_formatted, plus a labelled
*   numeric variable g_quality (0 "not found" ... 7 "number").
*
* Location input (exactly one of the following groups)
*   fulladdress(varname)              one string variable holding the address
*   number() street() postcode() city() county() state() country()
*                                     address assembled from its parts
*   coordinates(varname)              string variable "lat,lon"
*   latitude(varname) longitude(varname)
*
* Other options
*   key(str)          API key (required)
*   countrycode(str)  ISO 3166-1 alpha-2 code, or a variable containing codes
*   language(str)     language code, or a variable containing codes
*                     (defaults to "en")
*   replace           overwrite existing g_* results for the sample
*   resume            continue a previous run: observations with non-missing
*                     g_quality are skipped
*   paidkey           still accepted by the parser but not used anywhere
*
* Return codes raised by this program
*   9     missing dependency or invalid country code
*   110   output variables already exist and neither replace nor resume given
*   184   mutually exclusive options combined
*   198   no API key
*   498   no location option specified
*   499   invalid coordinates / special characters / g_quality missing on resume
*   2000  no observations, or nothing left to geocode
*
* NOTE(review): the request below is sent over plain http, so the API key
* travels unencrypted -- confirm whether https can be used with insheetjson.

program opencagegeo
    version 12.1
    syntax [in] [if], ///
        [ ///
        key(str) ///
        NUMber(varname) STReet(varname str) POSTcode(varname) ///
        city(varname str) county(varname str) state(varname str) ///
        country(varname str) FULLaddress(varname str) ///
        COORDinates(varname str) LATitude(varname) ///
        LONgitude(varname) countrycode(str) LANGuage(str) ///
        replace RESume PAIDkey ///
        ]

    *** Mark sample
    marksample touse, novarlist
    qui count if `touse' == 1
    local todo = `r(N)'

    *** Generate tempvar sorder (original sort order, restored after geocoding)
    tempvar sorder
    gen `sorder' = _n

    *** Check that _N > 0
    cap assert _N > 0
    if _rc != 0 {
        di as err "No observations"
        exit 2000
    }

    *** Check for insheetjson and libjson
    * Each result is recorded in a flag: testing _rc once at the end would
    * only reflect the last -which- call and miss a missing insheetjson.
    local missingdep = 0
    cap which insheetjson
    if _rc == 111 {
        di as err "Insheetjson.ado not found, please ssc install insheetjson"
        local missingdep = 1
    }
    cap which libjson.mlib
    if _rc == 111 {
        di as err "Libjson.mlib not found, please ssc install libjson"
        local missingdep = 1
    }
    * Return code 9 is what earlier versions produced here (failed assert).
    if (`missingdep' == 1) exit 9

    *** Check that replace and resume are not combined
    if ("`resume'" != "") {
        if ("`replace'" != "") {
            di as err "Options replace and resume may not be combined"
            exit 184
        }
        cap confirm variable g_quality
        if _rc != 0 {
            di as err "Variable g_quality not found, cannot resume geocoding"
            exit 499
        }
        * Progress counter starts after the observations already geocoded
        qui count if `touse' == 1 & g_quality != .
        local geocoded = `r(N)' + 1
        * Adjust touse if resume
        qui replace `touse' = 0 if g_quality != .
    }
    else {
        local geocoded = 1
    }
    if ("`replace'" != "") {
        cap replace g_quality = . if `touse'
    }

    *** Check that touse is not zero for all observations
    qui sum `touse'
    if `r(mean)' == 0 {
        di as err "Nothing to geocode"
        exit 2000
    }

    *** Generate language tempvar (if specified)
    * language() may name a variable (per-observation language) or be a literal
    local langvar = 0
    if ("`language'" != "") {
        cap confirm variable `language'
        if _rc == 0 {
            tempvar languageresponse
            qui gen `languageresponse' = `language'
            local langvar = 1
        }
    }
    else {
        local language = "en"
    }

    *** Generate countrycode tempvar (if specified)
    * countrycode() may name a variable or be a literal two-letter code;
    * either way every value is validated against the list below.
    local countrycodevar = 0
    if ("`countrycode'" != "") {
        local iso_3166 AD AE AF AG AI AL AM AO AQ AR AS AT AU AW AX AZ BA ///
            BB BD BE BF BG BH BI BJ BL BM BN BO BQ BR BS BT BV BW ///
            BY BZ CA CC CD CF CG CH CI CK CL CM CN CO CR CU CV CW ///
            CX CY CZ DE DJ DK DM DO DZ EC EE EG EH ER ES ET FI FJ ///
            FK FM FO FR GA GB GD GE GF GG GH GI GL GM GN GP GQ GR ///
            GS GT GU GW GY HK HM HN HR HT HU ID IE IL IM IN IO IQ ///
            IR IS IT JE JM JO JP KE KG KH KI KM KN KP KR KW KY KZ ///
            LA LB LC LI LK LR LS LT LU LV LY MA MC MD ME MF MG MH ///
            MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC ///
            NE NF NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL ///
            PM PN PR PS PT PW PY QA RE RO RS RU RW SA SB SC SD SE ///
            SG SH SI SJ SK SL SM SN SO SR SS ST SV SX SY SZ TC TD ///
            TF TG TH TJ TK TL TM TN TO TR TT TV TW TZ UA UG UM US ///
            UY UZ VA VC VE VG VI VN VU WF WS YE YT ZA ZM ZW
        cap confirm variable `countrycode'
        if _rc == 0 {
            tempvar countrycodequery
            qui gen `countrycodequery' = `countrycode'
            local countrycodevar = 1
            tempvar iso_check
            qui gen `iso_check' = 0
            foreach i of local iso_3166 {
                qui replace `iso_check' = 1 if upper(`countrycodequery') == "`i'"
            }
            cap assert `iso_check' == 1 if `countrycodequery' != ""
            if _rc == 9 {
                qui levelsof `countrycodequery' if `iso_check' == 0, local(levels)
                di as err "Following country codes are invalid: "`r(levels)'""
                exit 9
            }
        }
        else {
            local continue = 0
            foreach i of local iso_3166 {
                cap assert upper("`countrycode'") == "`i'"
                if _rc == 0 {
                    local continue = 1
                    continue, break
                }
            }
            if (`continue' == 0) {
                di as err "Country code `countrycode' is not valid"
                exit 9
            }
        }
    }

    *** Check that key was provided
    if ("`key'" == "") {
        di as err "Opencagedata API key required: Sign up for a free key at www.opencagedata.com"
        exit 198
    }

    *** Check that no options are combined inconsistently
    if ("`fulladdress'" != "") {
        cap assert "`coordinates'" == ""
        if _rc != 0 {
            di as err "Options fulladdress and coordinates may not be combined"
            exit 184
        }
        cap assert "`latitude'" == ""
        if _rc != 0 {
            di as err "Options fulladdress and latitude may not be combined"
            exit 184
        }
        cap assert "`longitude'" == ""
        if _rc != 0 {
            di as err "Options fulladdress and longitude may not be combined"
            exit 184
        }
        local type fulladdress
    }

    * Value checks below are restricted to the estimation sample so that
    * observations excluded by if/in (or already geocoded under resume)
    * cannot abort the run.
    if ("`coordinates'" != "") {
        cap assert "`latitude'" == ""
        if _rc != 0 {
            di as err "Options coordinates and latitude may not be combined"
            exit 184
        }
        cap assert "`longitude'" == ""
        if _rc != 0 {
            di as err "Options coordinates and longitude may not be combined"
            exit 184
        }
        cap assert regexm(`coordinates',",")==1 if `touse'
        if _rc != 0 {
            di as err "Latitudes and longitudes must be separated by a comma"
            exit 499
        }
        * Removing all commas must shorten the string by exactly one character
        cap assert length(`coordinates') == (length(subinstr(`coordinates',",","",.)) +1) if `touse'
        if _rc != 0 {
            di as err "Variable fed into coordinates option may only contain one comma"
            exit 499
        }
        tempvar latcheck
        qui gen `latcheck' = real(substr(`coordinates',1,strpos(`coordinates',",")-1))
        cap assert `latcheck' != . if `touse'
        if _rc != 0 {
            di as err "Latitudes may not contain non-numeric characters"
            exit 499
        }
        cap assert abs(`latcheck') <= 90 if `touse'
        if _rc != 0 {
            di as err "Latitudes must take values between -90 and 90"
            exit 499
        }
        tempvar loncheck
        qui gen `loncheck' = real(substr(`coordinates',strpos(`coordinates',",")+1,.))
        cap assert `loncheck' != . if `touse'
        if _rc != 0 {
            di as err "Longitudes may not contain non-numeric characters"
            exit 499
        }
        cap assert abs(`loncheck') <= 180 if `touse'
        if _rc != 0 {
            di as err "Longitudes must take values between -180 and 180"
            exit 499
        }
        local type coordinates
    }

    if ("`latitude'" != "" | "`longitude'" != "") {
        cap assert "`latitude'" != ""
        if _rc != 0 {
            di as err "The longitude option must be specified together with latitude"
            exit 499
        }
        cap assert "`longitude'" != ""
        if _rc != 0 {
            di as err "The latitude option must be specified together with longitude"
            exit 499
        }
        cap assert "`longitude'" == "`latitude'"
        if _rc == 0 {
            di "Warning: same variable specified for latitude and longitude"
        }

        * Check that latitude values are between -90 and 90
        * templat = string form sent to the API, latcheck = numeric form
        tempvar templat
        tempvar latcheck
        cap confirm numeric variable `latitude'
        if _rc == 7 {
            * latitude is a string variable
            qui gen `templat' = `latitude'
            qui gen `latcheck' = real(`latitude')
            cap assert `latcheck' != . if `touse'
            if _rc != 0 {
                di as err "Latitudes may not contain non-numeric characters"
                exit 499
            }
        }
        else {
            qui gen `templat' = string(`latitude')
            qui gen `latcheck' = `latitude'
        }
        cap assert abs(`latcheck') <= 90 if `touse'
        if _rc != 0 {
            di as err "Latitudes must take values between -90 and 90"
            exit 499
        }

        * Check that longitude values are between -180 and 180
        tempvar templon
        tempvar loncheck
        cap confirm numeric variable `longitude'
        if _rc == 7 {
            * longitude is a string variable
            qui gen `templon' = `longitude'
            qui gen `loncheck' = real(`longitude')
            cap assert `loncheck' != . if `touse'
            if _rc != 0 {
                di as err "Longitudes may not contain non-numeric characters"
                exit 499
            }
        }
        else {
            qui gen `templon' = string(`longitude')
            qui gen `loncheck' = `longitude'
        }
        cap assert abs(`loncheck') <= 180 if `touse'
        if _rc != 0 {
            di as err "Longitudes must take values between -180 and 180"
            exit 499
        }

        * Insert leading zeros for values below absolute 1 (required by the Opencage Geocoder)
        qui replace `templat' = "0" + `templat' if strpos(`templat',".") == 1
        qui replace `templat' = subinstr(`templat',"-.","-0.",.)
        qui replace `templon' = "0" + `templon' if strpos(`templon',".") == 1
        qui replace `templon' = subinstr(`templon',"-.","-0.",.)
        local type latlon
    }

    if "`type'" != "" {
        * fulladdress / coordinates / latlon exclude every address-part option
        local address_parts "number street postcode city county state country"
        foreach i of local address_parts {
            capture assert "``i''" == ""
            if _rc != 0 {
                di as err "Options `type' and `i' may not be combined"
                exit 184
            }
        }
    }
    else {
        * state is part of this test: it is a valid address part on its own
        if "`street'" == "" & "`number'" == "" & "`postcode'" == "" & ///
            "`city'" == "" & "`county'" == "" & "`state'" == "" & ///
            "`country'" == "" {
            di as err "No location specified"
            exit 498
        }
        local type address
    }

    quietly {

        *** Generate tempvar work containing the location
        tempvar work
        if ("`type'" == "fulladdress") {
            gen `work' = " " + `fulladdress' if `touse'
        }
        if ("`type'" == "address") {
            tempvar tempnumber
            tempvar temppostcode
            * number and postcode may be numeric; convert them to strings
            if ("`number'" != "") {
                cap confirm numeric variable `number'
                if _rc == 0 {
                    qui tostring `number', gen(`tempnumber')
                    qui replace `tempnumber' = "" if `tempnumber' == "."
                }
                else if _rc == 7 {
                    gen `tempnumber' = `number'
                }
            }
            if ("`postcode'" != "") {
                cap confirm numeric variable `postcode'
                if _rc == 0 {
                    qui tostring `postcode', gen(`temppostcode')
                    qui replace `temppostcode' = "" if `temppostcode' == "."
                }
                else if _rc == 7 {
                    gen `temppostcode' = `postcode'
                }
            }
            * Parts are joined with an URL-encoded comma (%2C); the separator
            * is only inserted when something precedes the part.
            gen `work' = ""
            if ("`number'" != "") {
                replace `work' = `work' + `tempnumber' if `touse' & `tempnumber' != ""
            }
            if ("`street'" != "") {
                replace `work' = `work' + " " + `street' if `touse' & `street' != ""
            }
            if ("`postcode'" != "") {
                * The postcode itself is always appended; previously it was
                * dropped whenever no number/street preceded it.
                replace `work' = `work' + "%2C" if `touse' & `work' != "" & `temppostcode' != ""
                replace `work' = `work' + `temppostcode' if `touse'
            }
            if ("`city'" != "" & "`postcode'" != "") {
                replace `work' = `work' +" " + `city' if `touse' & `city' != ""
            }
            if ("`city'" != "" & "`postcode'" == "") {
                replace `work' = `work' + "%2C" if `touse' & `city' != "" & `work' != ""
                replace `work' = `work' + `city' if `touse' & `city' != ""
            }
            if ("`county'" != "") {
                replace `work' = `work' + "%2C" if `touse' & `work' != "" & `county' != ""
                replace `work' = `work' + `county' if `touse'
            }
            if ("`state'" != "") {
                replace `work' = `work' + "%2C" if `touse' & `work' != "" & `state' != ""
                replace `work' = `work' + `state' if `touse'
            }
            if ("`country'" != "") {
                replace `work' = `work' + "%2C" if `touse' & `work' != "" & `country' != ""
                replace `work' = `work' + `country' if `touse'
            }
        }
        if ("`type'" == "coordinates") {
            gen `work' = `coordinates' if `touse'
        }
        if ("`type'" == "latlon") {
            gen `work' = `templat' + "%2C" + `templon' if `touse'
        }

        *** Generate local containing all var names to be filled in
        local tobefilled g_lat g_lon g_country g_state g_county g_city ///
            g_postcode g_street g_number g_confidence g_formatted

        *** Replace observations if replace option specified
        * needreplace == 0 means none of the output variables exists yet
        cap confirm new var `tobefilled'
        local needreplace = _rc
        if ("`replace'" != "" | "`resume'" != "" | `needreplace' == 0) {
            foreach var of local tobefilled {
                cap gen str224 `var' = ""
                if ("`replace'" != "") cap replace `var' = "" if `touse'
            }
            cap recast str224 `tobefilled'
        }
        else {
            foreach var of local tobefilled {
                cap confirm variable `var'
                if !_rc {
                    noi di as err "`var' does already exist."
                }
            }
            cap confirm variable g_quality
            if !_rc {
                noi di as err "g_quality does already exist."
            }
            noi di as err "Drop above variables or use replace option."
            exit 110
        }

        *** Generate tempvars
        * Alternative component names returned by the API plus the status code
        tempvar g_town g_village g_hamlet g_street_name g_road g_footway ///
            g_residential g_pedestrian g_code
        foreach tv in `g_town' `g_village' `g_hamlet' `g_street_name' `g_road' ///
            `g_footway' `g_residential' `g_pedestrian' `g_code' {
            cap gen str224 `tv' = ""
        }

        *** Clean up tempvar work to avoid problems when sending the query
        * Change some common address formats causing errors
        replace `work' = lower(`work')
        replace `work' = subinstr(`work',"&","%26",.) if `touse'
        replace `work' = subinstr(`work',"#","",.) if `touse'
        * Strip the leading zero from ordinals (" 01st" -> " 1st", ...)
        local ordinals 1st 2nd 3rd 4th 5th 6th 7th 8th 9th
        foreach o of local ordinals {
            replace `work' = subinstr(`work'," 0`o'"," `o'",.) if `touse'
        }
        * Double quotes would terminate the URL string literal
        replace `work' = subinstr(`work',`"""'," ",.) if `touse'
        * Remove multiple blanks
        replace `work' = itrim(`work')
        * Remove blanks after commas
        replace `work' = subinstr(strtrim(`work'), ", ", ",",.)
        * Remove leading and trailing blanks and replace interior blanks with +
        replace `work' = subinstr(strtrim(`work'), " ", "+",.)

        *** Check tempvar work for special characters (Stata 13 or older) or encode (if Stata 14 or newer)
        if c(stata_version) >= 14 {
            replace `work' = ustrto(`work', "ascii", 4)
            replace `work' = subinstr(`work',"\","%",.)
        }
        else {
            foreach num of numlist 1/31 127/255 {
                cap assert index(`work',char(`num')) == 0
                if _rc != 0 {
                    di as err "Location names may not contain special characters"
                    exit 499
                }
            }
        }

        *** Generate local containing column selectors
        * Order must match the variable list passed to insheetjson below
        local selectors results:1:geometry:lat results:1:geometry:lng ///
            results:1:components:country results:1:components:state ///
            results:1:components:county results:1:components:city ///
            results:1:components:postcode results:1:components:street ///
            results:1:components:house_number results:1:confidence ///
            results:1:formatted results:1:components:town ///
            results:1:components:village results:1:components:hamlet ///
            results:1:components:street_name results:1:components:road ///
            results:1:components:footway results:1:components:residential ///
            results:1:components:pedestrian status:code

        *** Order data set for geocoding
        * Observations to be geocoded (touse == 1) end up last
        sort `touse' `sorder'

        *** Generate locals for loop
        local cnt = _N
        count if `touse' == 0
        local start = `r(N)' + 1

        *** Loop over observations to be geocoded
        forval i = `start'/`cnt' {
            cap {
                local offset = `i'-1
                local query = `work'[`i']
                if (`countrycodevar' == 1) local countrycode = `countrycodequery'[`i']
                if (`langvar' == 1) local language = `languageresponse'[`i']
                insheetjson `tobefilled' `g_town' `g_village' `g_hamlet' `g_street_name' `g_road' ///
                    `g_footway' `g_residential' `g_pedestrian' `g_code' ///
                    using "http://api.opencagedata.com/geocode/v1/json?q=`query'&key=`key'&no_annotations=1&language=`language'&countrycode=`countrycode'&limit=1", ///
                    col(`selectors') flatten limit(1) offset(`offset') replace
                if (`g_code'[`i'] != "200") {
                    * 111: the very first request failed, 112: a later one
                    if (`i' == `start') {
                        local error = 111
                    }
                    else {
                        local error = 112
                    }
                    * Unprocessed observations leave the sample so that
                    * g_quality stays missing and resume can pick them up
                    qui replace `touse' = 0 if _n >= `i'
                    continue, break
                }
                noi di "OpenCage geocoded `geocoded' of `todo'"
                local ++geocoded
            }
            if _rc!=0 {
                local error = _rc
                qui replace `touse' = 0 if _n >= `i'
                continue, break
            }
        }

        *** Generate unique street variable
        cap replace g_street = g_street + `g_street_name' + `g_road' + ///
            `g_pedestrian' + `g_residential' ///
            if (missing(g_street) | g_street == "[]") & `touse'

        *** Generate unique city variable
        cap replace g_city = g_city + `g_town' + `g_village' + `g_hamlet' ///
            if (missing(g_city) | g_city == "[]") & `touse'

        *** Compress variables and convert to UTF-8 if Stata 14
        foreach var of local tobefilled {
            if c(stata_version) >= 14 {
                replace `var' = ustrunescape(`var') if `touse'
            }
            * "[]" is what an absent JSON field is stored as
            replace `var' = subinstr(`var',"[]","",.) if `touse'
        }
        cap compress `tobefilled'
        sort `sorder'

        *** Generate g_quality variable
        * The most detailed non-empty component determines the value
        cap gen g_quality = .
        replace g_quality = 0 if `touse'
        replace g_quality = 1 if g_country != "" & `touse'
        replace g_quality = 2 if g_state != "" & `touse'
        replace g_quality = 3 if g_county != "" & `touse'
        replace g_quality = 4 if g_city != "" & `touse'
        replace g_quality = 5 if g_postcode != "" & `touse'
        replace g_quality = 6 if g_street != "" & `touse'
        replace g_quality = 7 if g_number != "" & `touse'

        *** Define labels and label values of g_quality
        cap label define quality 0 "not found" 1 "country" 2 "state" ///
            3 "county" 4 "city" 5 "postcode" 6 "street" 7 "number"
        label values g_quality quality

        *** Exit if error occurred
        if ("`error'" != "") {
            if (`error' == 111) {
                noi di as err "No observations geocoded: Invalid key, rate limit exceeded or no internet connection"
                exit
            }
            else if (`error' == 112) {
                noi di as err "Rate limit exceeded or internet connection failed"
                exit
            }
            exit `error'
        }

        *** Display summary table of g_quality
        * Under resume the table covers the full if/in sample again
        if ("`resume'" != "") marksample touse, novarlist
        noi tab g_quality if `touse'
        noi di _newline "Data generated is jointly licensed under the ODbL and CC-BY-SA licenses"
    }
end