*******************************************************************************
* unicefdata_xmltoyaml_py
*! v 1.2.0   16Jan2026               by Joao Pedro Azevedo (UNICEF)
* SDMX XML to YAML converter using Python backend
* Handles large XML files that Stata cannot process natively
* v1.2.0: Show enrichment progress inline instead of capturing to temp file
* v1.1.0: Added ENRICHDATAFLOWS option to add dataflow info to indicators
* v1.0.1: Fixed adopath search to use actual sysdir paths
*******************************************************************************

/*
DESCRIPTION:
    Converts SDMX XML files to YAML format using a Python helper script.
    This approach handles arbitrarily large XML files without hitting
    Stata's line-length limitations.
    
REQUIREMENTS:
    - Python 3.6+ installed and accessible via 'python' command
      (or via the executable named in the python() option)
    - requests package required for ENRICHDATAFLOWS option (pip install requests)
    - lxml package recommended (pip install lxml) for better performance
    - Falls back to xml.etree.ElementTree if lxml not available
    
SUPPORTED TYPES:
    dataflows   - SDMX dataflow definitions
    codelists   - Generic codelist items
    countries   - Country codes (CL_COUNTRY)
    regions     - Regional codes (CL_WORLD_REGIONS)
    dimensions  - DSD dimension definitions
    attributes  - DSD attribute definitions
    indicators  - Indicator codelist (CL_UNICEF_INDICATOR)
    
SYNTAX:
    unicefdata_xmltoyaml_py, type(string) xmlfile(string) outfile(string)
        [agency(string) version(string) source(string) 
         codelistid(string) codelistname(string) python(string)
         enrichdataflows fallbacksequencesout(string)]
    
OPTIONS:
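    agency(string)   - Agency identifier passed to the script (default: UNICEF)
    version(string)  - Version string passed to the script (default: 2.0.0)
    python(string)   - Python executable to invoke (default: python)
    enrichdataflows  - For indicators: query all dataflows to add a 'dataflows'
                       field to each indicator (takes ~1-2 minutes)
    fallbacksequencesout(string) - Also generate the fallback sequences YAML
                       at this path (only with enrichdataflows)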
    
RETURNS:
    r(count)    - Number of items parsed
    r(type)     - Database type processed
*/

program define unicefdata_xmltoyaml_py, rclass
    version 14.0
    
    * Purpose: run the Python helper unicefdata_xml2yaml.py on an SDMX XML
    *          file to produce a YAML file, then count the parsed items.
    * Required: type()    - one of the supported metadata types (see below)
    *           xmlfile() - existing XML input file
    *           outfile() - YAML file the Python script is asked to write
    * Optional: agency() (default UNICEF), version() (default 2.0.0),
    *           source(), codelistid(), codelistname(), python() (default
    *           "python"), enrichdataflows, fallbacksequencesout()
    * Returns:  r(count) - number of "    code: " lines found in outfile()
    *           r(type)  - lower-cased type that was processed
    * Errors:   198 invalid type; 601 XML file or Python script not found;
    *           603 output file missing after the Python run
    syntax, TYPE(string) XMLFILE(string) OUTFILE(string) ///
        [AGENCY(string) VERSION(string) SOURCE(string) ///
         CODELISTID(string) CODELISTNAME(string) PYTHON(string) ///
         ENRICHDATAFLOWS FALLBACKSEQUENCESOUT(string)]
    
    * Set defaults
    if ("`agency'" == "") local agency "UNICEF"
    if ("`version'" == "") local version "2.0.0"
    if ("`python'" == "") local python "python"
    
    * Validate type: it must be exactly ONE word from the valid list.
    * (A substring test would accept e.g. "dataflows codelists", which would
    * then reach Python as two separate positional arguments.)
    local valid_types "dataflows codelists countries regions dimensions attributes indicators"
    local type = lower("`type'")
    local n_type_words : word count `type'
    local type_known : list type in valid_types
    if (`n_type_words' != 1 | !`type_known') {
        di as err "Invalid type: `type'"
        di as err "Valid types: `valid_types'"
        error 198
    }
    
    * Check XML file exists
    capture confirm file "`xmlfile'"
    if (_rc != 0) {
        di as err "XML file not found: `xmlfile'"
        error 601
    }
    
    * Locate the Python script. Candidates are tried in this order:
    *   1. paths relative to the current working directory
    *   2. <sysdir>/py/ for plus, personal, site, base
    *   3. <sysdir>/u/  for plus, personal, site
    * Actual sysdir paths are used rather than symbolic adopath names.
    local script_name "unicefdata_xml2yaml.py"
    local script_path ""
    local candidates `""stata/src/py/`script_name'" "`script_name'""'
    foreach sysdir in plus personal site base {
        * Normalise Windows separators so the joined path is consistent
        local basepath = subinstr("`c(sysdir_`sysdir')'", "\", "/", .)
        if ("`basepath'" != "") {
            local candidates `"`candidates' "`basepath'py/`script_name'""'
        }
    }
    foreach sysdir in plus personal site {
        local basepath = subinstr("`c(sysdir_`sysdir')'", "\", "/", .)
        if ("`basepath'" != "") {
            local candidates `"`candidates' "`basepath'u/`script_name'""'
        }
    }
    foreach trypath of local candidates {
        capture confirm file "`trypath'"
        if (_rc == 0) {
            local script_path "`trypath'"
            continue, break
        }
    }
    
    if ("`script_path'" == "") {
        di as err "Python script not found: `script_name'"
        di as err "Searched in: stata/src/py/, current directory, sysdir_{plus,personal,site,base}/py/, sysdir_{plus,personal,site}/u/"
        di as err "Please ensure unicefdata_xml2yaml.py is installed with the package"
        error 601
    }
    
    * Build metadata arguments (matching Python argparse format)
    local meta_args ""
    if ("`source'" != "") {
        local meta_args `"`meta_args' --source "`source'""'
    }
    if ("`codelistid'" != "") {
        local meta_args `"`meta_args' --codelist-id "`codelistid'""'
    }
    if ("`codelistname'" != "") {
        local meta_args `"`meta_args' --codelist-name "`codelistname'""'
    }
    if ("`enrichdataflows'" != "") {
        local meta_args `"`meta_args' --enrich-dataflows"'
    }
    if ("`fallbacksequencesout'" != "") {
        local meta_args `"`meta_args' --fallback-sequences-output "`fallbacksequencesout'""'
    }
    local meta_args `"`meta_args' --agency "`agency'" --version "`version'""'
    
    * Temporary file that receives the Python output in normal mode
    tempfile pyout
    
    * Full command line handed to the shell
    local cmd `""`python'" "`script_path'" `type' "`xmlfile'" "`outfile'" `meta_args'"'
    
    di as text "  Script: `script_path'"
    di as text "  Running Python XML parser..."
    if ("`fallbacksequencesout'" != "") {
        di as text "  Fallback sequences output: `fallbacksequencesout'"
    }
    
    * Enrichment shows progress inline; otherwise stdout/stderr are captured
    * to the temp file. The invocation is the same on every OS.
    local capture_output = ("`enrichdataflows'" == "")
    if (`capture_output') {
        shell `cmd' > "`pyout'" 2>&1
    }
    else {
        di as text "  (Enriching with dataflow info - this takes ~1-2 minutes)"
        shell `cmd'
    }
    
    * Verify output file was created.
    * NOTE(review): a stale outfile() left by an earlier run also passes this
    * check, so a failed Python run is not detected in that case - confirm
    * whether callers remove the file beforehand.
    capture confirm file "`outfile'"
    if (_rc != 0) {
        di as err "Output file was not created: `outfile'"
        if (`capture_output') {
            di as err "Python output:"
            tempname logfh
            * capture NOISILY: a plain capture block would also suppress the
            * display statements, hiding the very log we want to show.
            capture noisily {
                file open `logfh' using "`pyout'", read text
                file read `logfh' line
                while (r(eof) == 0) {
                    * macval() + compound quotes: print the line verbatim
                    * even if it contains quotes, backticks or $
                    di as err `"  `macval(line)'"'
                    file read `logfh' line
                }
            }
            * Best-effort close; the handle may never have been opened
            capture file close `logfh'
        }
        else {
            di as err "Python output was not captured (enrichment mode prints it inline above)"
        }
        error 603
    }
    
    * Count items by counting "    code: " lines in output file (dict format)
    * The Python script outputs dict format: each item has "    code: XXX" line
    local count = 0
    tempname fh
    capture noisily {
        file open `fh' using "`outfile'", read text
        file read `fh' line
        while (r(eof) == 0) {
            * Match lines that start with "    code: " (4 spaces + code:)
            if (strmatch(`"`macval(line)'"', "    code: *")) {
                local ++count
            }
            file read `fh' line
        }
    }
    local read_rc = _rc
    * Always release the handle, even when reading failed part-way
    capture file close `fh'
    if (`read_rc' != 0) {
        * The error message was already shown by capture noisily
        exit `read_rc'
    }
    
    * Return results
    return scalar count = `count'
    return local type "`type'"
    
    di as result "  Parsed `count' `type'"
end

* Note: The main wrapper unicefdata_xmltoyaml is now defined in unicefdata_xmltoyaml.ado
* This file only contains the Python-specific parser unicefdata_xmltoyaml_py