*******************************************************************************
* unicefdata_xmltoyaml
*! v 2.0.0   30Jan2026               by Joao Pedro Azevedo (UNICEF)
* Generic XML to YAML parser for SDMX data structures
* Supports both Python (preferred) and pure Stata (fallback) parsers
* Added FALLBACKSEQUENCESOUT option for generating fallback sequences
* Added ENRICHDATAFLOWS option for indicator metadata
*******************************************************************************

/*
DESCRIPTION:
    A generic XML parser that converts SDMX XML structures to YAML files.
    Uses a schema registry to define XML element paths and YAML field mappings
    for different database types (dataflows, codelists, countries, etc.)

    Parser selection (when neither forcepython nor forcestata is given):
      - Python is used when enrichdataflows is requested, or when the XML
        file is larger than 500 KB (robust large-file handling).
      - Otherwise the pure Stata parser is used.
      - If the Python parser fails and was not forced, the pure Stata parser
        is used as an automatic fallback.

SUPPORTED TYPES:
    dataflows   - SDMX dataflow definitions
    codelists   - Generic codelist items
    countries   - Country codes (CL_COUNTRY)
    regions     - Regional codes (CL_WORLD_REGIONS)
    dimensions  - DSD dimension definitions
    attributes  - DSD attribute definitions
    indicators  - Indicator codelist (CL_UNICEF_INDICATOR)

SYNTAX:
    unicefdata_xmltoyaml, type(string) xmlfile(string) outfile(string)
        [agency(string) version(string) contenttype(string)
         codelistid(string) codelistname(string) syncedat(string)
         source(string) append forcepython forcestata enrichdataflows
         fallbacksequencesout(string)]

OPTIONS:
    type(string)        - Database type, exactly one of SUPPORTED TYPES
    xmlfile(string)     - Input XML file path
    outfile(string)     - Output YAML file path
    agency(string)      - Agency name (default: UNICEF)
    version(string)     - Metadata version (default: 2.0.0)
    contenttype(string) - Content type for watermark (default: the type)
    codelistid(string)  - Codelist ID for countries/regions
    codelistname(string)- Codelist name for countries/regions
    syncedat(string)    - Sync timestamp (auto-generated if not provided)
    source(string)      - Source URL for watermark
    append              - Append to existing file (no header)
    forcepython         - Force use of Python parser (requires Python 3.6+)
    forcestata          - Force use of pure Stata parser (no Python required)
    enrichdataflows     - For indicators: query API to add dataflows field
                          (requires Python + requests package, takes ~1-2 min)
                          Ignored, with a note, when the Stata parser is used.
    fallbacksequencesout(string) - Also generate fallback sequences YAML to
                          this path (only with enrichdataflows option)

RETURNS:
    r(count)  - Number of items parsed
    r(type)   - Database type processed
    r(parser) - Parser used ("python" or "stata")

ERRORS:
    198 - invalid type, or both forcepython and forcestata specified
    The Python parser's own return code when forcepython is given and the
    Python parser fails.

EXAMPLE:
    tempfile xml_data
    copy "https://sdmx.data.unicef.org/.../dataflow/UNICEF" "`xml_data'", public

    * Auto-detect best parser
    unicefdata_xmltoyaml, type(dataflows) xmlfile("`xml_data'") ///
        outfile("metadata/dataflows.yaml") agency(UNICEF)

    * Force Python parser
    unicefdata_xmltoyaml, type(dataflows) xmlfile("`xml_data'") ///
        outfile("metadata/dataflows.yaml") agency(UNICEF) forcepython

    * Force Stata parser (no Python required)
    unicefdata_xmltoyaml, type(dataflows) xmlfile("`xml_data'") ///
        outfile("metadata/dataflows_stataonly.yaml") agency(UNICEF) forcestata

    * Indicators with dataflow enrichment
    unicefdata_xmltoyaml, type(indicators) xmlfile("`xml_data'") ///
        outfile("metadata/indicators.yaml") agency(UNICEF) enrichdataflows
*/

*******************************************************************************
* Main entry point - wrapper with parser selection
*******************************************************************************

program define unicefdata_xmltoyaml, rclass
    version 14.0

    syntax, TYPE(string) XMLFILE(string) OUTFILE(string) ///
        [AGENCY(string) VERSION(string) CONTENTTYPE(string) ///
         CODELISTID(string) CODELISTNAME(string) SYNCEDAT(string) ///
         SOURCE(string) APPEND FORCEPYTHON FORCESTATA ENRICHDATAFLOWS ///
         FALLBACKSEQUENCESOUT(string)]

    * Set defaults
    if ("`agency'" == "")      local agency "UNICEF"
    if ("`version'" == "")     local version "2.0.0"
    if ("`contenttype'" == "") local contenttype "`type'"

    * Generate timestamp if not provided
    * NOTE(review): c(current_date)/c(current_time) are the machine's local
    * clock, yet the value is suffixed with "Z" - confirm whether a true UTC
    * timestamp is expected by consumers of the YAML watermark.
    if ("`syncedat'" == "") {
        local syncedat : di %tcCCYY-NN-DD!THH:MM:SS clock("`c(current_date)' `c(current_time)'", "DMYhms")
        local syncedat = trim("`syncedat'") + "Z"
    }

    * Validate type: it must be exactly one word from the supported list.
    * A plain substring test would also accept multi-word values such as
    * "dataflows codelists", so check the word count and list membership.
    local valid_types "dataflows codelists countries regions dimensions attributes indicators"
    local type = lower("`type'")
    local type_words : word count `type'
    local type_known : list type in valid_types
    if (`type_words' != 1 | !`type_known') {
        di as err "Invalid type: `type'"
        di as err "Valid types: `valid_types'"
        exit 198
    }
    * Normalise to the bare token (drops any stray surrounding blanks)
    local type : word 1 of `type'

    * Check that both force options are not specified
    if ("`forcepython'" != "" & "`forcestata'" != "") {
        di as err "Cannot specify both forcepython and forcestata options"
        exit 198
    }

    * Determine which parser to use
    local use_python = 0
    local chunk_threshold = 500000  // 500KB - use Python for larger files

    if ("`forcepython'" != "") {
        local use_python = 1
    }
    else if ("`forcestata'" != "") {
        local use_python = 0
    }
    else if ("`enrichdataflows'" != "") {
        * Enrichment is only passed to the Python parser, so prefer it
        * regardless of file size; the Stata fallback below still applies
        * if Python cannot be used.
        local use_python = 1
    }
    else {
        * Auto-detect based on file size (checksum reports r(filelen));
        * if the size cannot be read, stay with the Stata parser.
        quietly {
            capture checksum "`xmlfile'"
            if (_rc == 0 & !missing(r(filelen))) {
                if (r(filelen) > `chunk_threshold') {
                    local use_python = 1
                }
            }
        }
    }

    * Get schema configuration for this type
    _xmltoyaml_get_schema, type(`type')
    local xml_root     "`r(xml_root)'"
    local open_tag     "`r(open_tag)'"
    local close_tag    "`r(close_tag)'"
    local xml_filter   "`r(xml_filter)'"
    local id_attr      "`r(id_attr)'"
    local name_element "`r(name_element)'"
    local desc_element "`r(desc_element)'"
    local extra_attrs  "`r(extra_attrs)'"
    local yaml_fields  "`r(yaml_fields)'"
    local list_name    "`r(list_name)'"

    local parser_used = ""
    local count = 0

    if (`use_python') {
        * Use Python parser

        * Build enrichdataflows option for Python
        local enrich_opt ""
        if ("`enrichdataflows'" != "") {
            local enrich_opt "enrichdataflows"
        }

        * Build fallback sequences option for Python
        local fallback_opt ""
        if (`"`fallbacksequencesout'"' != "") {
            local fallback_opt `"fallbacksequencesout("`fallbacksequencesout'")"'
        }

        * NOTE(review): contenttype, syncedat and append are not forwarded to
        * the Python parser here - confirm that this is intentional.
        capture noisily unicefdata_xmltoyaml_py, ///
            type("`type'") ///
            xmlfile("`xmlfile'") ///
            outfile("`outfile'") ///
            agency("`agency'") ///
            version("`version'") ///
            source("`source'") ///
            codelistid("`codelistid'") ///
            codelistname("`codelistname'") ///
            `enrich_opt' `fallback_opt'

        local py_rc = _rc

        if (`py_rc' == 0) {
            local count = r(count)
            local parser_used "python"
        }
        else if ("`forcepython'" != "") {
            * Python was forced but failed; its own error text was already
            * shown by -capture noisily-, so just propagate the return code.
            di as err "Python parser failed with error `py_rc'"
            exit `py_rc'
        }
        else {
            * Python failed, fall back to Stata
            di as txt "  Python not available, falling back to Stata parser..."
            local use_python = 0
        }
    }

    if (`use_python' == 0 & "`parser_used'" == "") {
        * The Python-only options are not passed to the Stata parser; tell
        * the user instead of dropping them silently.
        if ("`enrichdataflows'" != "" | `"`fallbacksequencesout'"' != "") {
            di as txt "  Note: enrichdataflows/fallbacksequencesout require the Python parser and are ignored by the Stata parser"
        }

        * Use native Stata parser
        _xmltoyaml_parse, ///
            xmlfile("`xmlfile'") ///
            outfile("`outfile'") ///
            type("`type'") ///
            xmlroot("`xml_root'") ///
            idattr("`id_attr'") ///
            opentag("`open_tag'") ///
            closetag("`close_tag'") ///
            nameelement("`name_element'") ///
            descelement("`desc_element'") ///
            extraattrs("`extra_attrs'") ///
            yamlfields("`yaml_fields'") ///
            listname("`list_name'") ///
            agency("`agency'") ///
            version("`version'") ///
            contenttype("`contenttype'") ///
            codelistid("`codelistid'") ///
            codelistname("`codelistname'") ///
            syncedat("`syncedat'") ///
            source("`source'") ///
            `append' forcestata

        local count = r(count)
        local parser_used "stata"
    }

    * Return results
    return scalar count = `count'
    return local type "`type'"
    return local parser "`parser_used'"
end