*******************************************************************************
* _unicefdata_sync_ind_meta
*! v 1.2.0   16Jan2026               by Joao Pedro Azevedo (UNICEF)
* Helper program for unicefdata_sync: Sync full indicator catalog
*
* Uses unicefdata_xmltoyaml (Python backend) to handle the large XML file
* that exceeds Stata's macro length limits when parsed inline.
*******************************************************************************

program define _unicefdata_sync_ind_meta, rclass
    * Purpose:
    *   Download the CL_UNICEF_INDICATOR codelist XML for `agency' and convert
    *   it to YAML at `outfile' by calling unicefdata_xmltoyaml.
    *
    * Options:
    *   outfile(string)               path of the YAML file to (re)generate
    *   agency(string)                agency id used to build the download URL
    *   force                         skip the cache staleness check
    *   forcepython / forcestata      parser selection passed to unicefdata_xmltoyaml
    *                                 (forcepython is the default; forcestata wins
    *                                 when both are given)
    *   enrichdataflows               passed through to unicefdata_xmltoyaml
    *   fallbacksequencesout(string)  passed through to unicefdata_xmltoyaml
    *
    * Returns:
    *   r(count)   number of indicators (0 when the download or the parse fails)
    *   r(cached)  1 when the existing file was reused, 0 otherwise
    *
    * Errors:
    *   Exits with error 601 when unicefdata_xmltoyaml returns a non-zero code.
    *   A failed download prints an error message but exits with return code 0.
    syntax, OUTFILE(string) AGENCY(string) [FORCE FORCEPYTHON FORCESTATA ENRICHDATAFLOWS FALLBACKSEQUENCESOUT(string)]

    local cache_max_age_days = 30
    local codelist_url "https://sdmx.data.unicef.org/ws/public/sdmxapi/rest/codelist/UNICEF/CL_UNICEF_INDICATOR/1.0"

    *---------------------------------------------------------------------------
    * Staleness check: reuse the existing file when its embedded timestamp is
    * less than `cache_max_age_days' days old
    *---------------------------------------------------------------------------
    if ("`force'" == "") {
        capture confirm file "`outfile'"
        if (_rc == 0) {
            * Defaults guarantee the freshness test below is always a valid
            * expression, even when no timestamp line is found in the file.
            local found_date = 0
            local file_age = .

            tempname fh_check
            capture file open `fh_check' using "`outfile'", read
            if (_rc == 0) {
                * Scan at most the first 20 lines for a timestamp field
                local line_count = 0
                file read `fh_check' line
                while (r(eof) == 0 & `line_count' < 20) {
                    local line_count = `line_count' + 1
                    * Accept both synced_at and last_updated (Python/R use last_updated)
                    if (strmatch(`"`line'"', "*synced_at:*") | strmatch(`"`line'"', "*last_updated:*")) {
                        * Pull YYYY-MM-DD straight out of the line. Skipping any
                        * non-digit characters after the colon handles unquoted,
                        * single-quoted and double-quoted YAML values alike.
                        if (regexm(`"`line'"', "(synced_at|last_updated):[^0-9]*([0-9][0-9][0-9][0-9])-([0-9][0-9])-([0-9][0-9])")) {
                            local sync_year  = real(regexs(2))
                            local sync_month = real(regexs(3))
                            local sync_day   = real(regexs(4))
                            * mdy() yields missing for an impossible calendar date
                            local sync_date = mdy(`sync_month', `sync_day', `sync_year')
                            if (!missing(`sync_date')) {
                                local today_date = date("`c(current_date)'", "DMY")
                                local file_age = `today_date' - `sync_date'
                                local found_date = 1
                            }
                        }
                        * The first timestamp field decides; stop scanning
                        continue, break
                    }
                    file read `fh_check' line
                }
                file close `fh_check'
            }

            if (`found_date' == 1 & `file_age' < `cache_max_age_days') {
                * File is fresh enough - count existing entries and return.
                * An entry is a line indented by exactly two spaces that ends
                * with a colon.
                local n_cached = 0
                tempname infh
                capture file open `infh' using "`outfile'", read
                if (_rc == 0) {
                    file read `infh' line
                    while (r(eof) == 0) {
                        if (strmatch(`"`line'"', "  *:") & !strmatch(`"`line'"', "    *")) {
                            local n_cached = `n_cached' + 1
                        }
                        file read `infh' line
                    }
                    file close `infh'
                }
                * Subtract 1 for metadata entry; never report a negative count
                local n_cached = max(0, `n_cached' - 1)
                di as text "     → Using cached file (`file_age' days old, threshold: `cache_max_age_days' days)"
                return scalar count = `n_cached'
                return scalar cached = 1
                exit
            }
        }
    }

    *---------------------------------------------------------------------------
    * Fetch XML from API
    *---------------------------------------------------------------------------
    local base_url "https://sdmx.data.unicef.org/ws/public/sdmxapi/rest"
    local url "`base_url'/codelist/`agency'/CL_UNICEF_INDICATOR/latest"

    tempfile xmlfile
    capture copy "`url'" "`xmlfile'", public replace

    if (_rc != 0) {
        * Download failure is reported through r(count) = 0, not an error code
        di as err "     Failed to download indicator codelist from API"
        return scalar count = 0
        return scalar cached = 0
        exit
    }

    *---------------------------------------------------------------------------
    * Use Python-based unicefdata_xmltoyaml for robust parsing
    * This avoids Stata's macro length limitation with large XML files
    *---------------------------------------------------------------------------

    * Parser option: forcepython unless the caller explicitly asked for forcestata
    local parser_option "forcepython"
    if ("`forcestata'" != "") {
        local parser_option "forcestata"
    }

    * Build fallback option if specified
    local fallback_opt ""
    if (`"`fallbacksequencesout'"' != "") {
        local fallback_opt `"fallbacksequencesout("`fallbacksequencesout'")"'
    }

    capture noisily unicefdata_xmltoyaml, ///
        type(indicators) ///
        xmlfile("`xmlfile'") ///
        outfile("`outfile'") ///
        agency("`agency'") ///
        version("1.0") ///
        source("`codelist_url'") ///
        codelistid("CL_UNICEF_INDICATOR") ///
        codelistname("UNICEF Indicator Codelist") ///
        `parser_option' `enrichdataflows' `fallback_opt'

    if (_rc == 0) {
        local n_indicators = r(count)
        return scalar count = `n_indicators'
        return scalar cached = 0
        exit
    }

    * Parser failed - report error (cannot fall back to Stata for this large file)
    di as err "     Python parser required for indicator metadata (file too large for Stata)"
    di as err "     Ensure Python 3.6+ is installed and unicefdata_xml2yaml.py is accessible"
    return scalar count = 0
    return scalar cached = 0
    error 601
end