"""littext_run: single-entry script invoked by _littext_analyze.ado.

The calling .ado runs this file via `python script` and must set the
following Stata locals beforehand:

    pypath          absolute path to the package's python/ subdirectory
    corpus_dta      path to the temporary .dta written by the calling .ado
    unit            unit of analysis ("sentence", "abstract", or "paragraph")
    embedmodel      sentence-transformers model name
    minfreq         minimum document frequency for constructs (as string)
    maxrelations    cap on candidate relationships (as string)
    addsent         "1" to add VADER polarity, "0" to skip
    q               "1" for quiet, "0" for verbose
    mintextlen      minimum text length in characters (v0.3 Tier-1; as string)
    keepempty_flag  "1" to skip row-drop, "0" otherwise (v0.3 Tier-1)
    texttype        text-kind declaration (v0.3 Tier-2): one of
                    abstract / fulltext / transcript / review / comment / other

The script populates three pre-existing Stata frames (lt_constructs,
lt_relations, lt_diag) via sfi.Frame. Creating those frames before this
script is invoked is the responsibility of the calling .ado.
"""
from __future__ import annotations

import os
import sys
import traceback


def _main() -> None:
    """Read the run parameters from Stata locals and hand them to the pipeline.

    Empty locals fall back to the defaults spelled out below. Numeric and
    flag locals arrive as strings and are converted with ``int``; a
    non-integer value therefore raises ``ValueError``.
    """
    # Environment defaults must be in place before any heavy library loads.
    # setdefault leaves values the user has already exported untouched.
    for env_name, env_default in (
        ("CUDA_VISIBLE_DEVICES", ""),
        ("HF_HUB_DISABLE_TELEMETRY", "1"),
        ("TOKENIZERS_PARALLELISM", "false"),
    ):
        os.environ.setdefault(env_name, env_default)

    from sfi import Macro

    def text_local(name: str, fallback: str) -> str:
        # An unset or empty local is replaced by the fallback.
        return Macro.getLocal(name) or fallback

    def int_local(name: str, fallback: str) -> int:
        return int(text_local(name, fallback))

    def flag_local(name: str) -> bool:
        # Flags are passed as "1" / "0"; empty counts as "0".
        return bool(int_local(name, "0"))

    # Make the package's python/ folder importable.
    package_dir = os.path.abspath(Macro.getLocal("pypath"))
    if package_dir not in sys.path:
        sys.path.insert(0, package_dir)

    # Pull every parameter out of the Stata locals.
    corpus_path = Macro.getLocal("corpus_dta")
    unit = text_local("unit", "sentence")
    embed_model = text_local("embedmodel", "all-MiniLM-L6-v2")
    min_freq = int_local("minfreq", "2")
    max_relations = int_local("maxrelations", "100000")
    add_sent = flag_local("addsent")
    quiet = flag_local("q")
    min_text_len = int_local("mintextlen", "0")
    keep_empty = flag_local("keepempty_flag")
    texttype = text_local("texttype", "abstract")

    if not quiet:
        echo_lines = (
            f" [py] entry: corpus={corpus_path!r}",
            f" [py] unit={unit} embed={embed_model} minfreq={min_freq}",
            f" [py] max_relations={max_relations} add_sentiment={add_sent}",
            f" [py] mintextlen={min_text_len} keepempty={keep_empty}",
            f" [py] texttype={texttype}",
        )
        for echo_line in echo_lines:
            print(echo_line, flush=True)

    # The pipeline is imported only now (one bridge crossing), so any
    # spaCy / sentence-transformers cold-load cost is paid inside this single
    # Stata-Python call.
    from littext_pipeline import run_pipeline

    pipeline_kwargs = {
        "corpus_path": corpus_path,
        "unit": unit,
        "embed_model": embed_model,
        "min_freq": min_freq,
        "max_relations": max_relations,
        "add_sentiment": add_sent,
        "quiet": quiet,
        "min_text_len": min_text_len,
        "keep_empty": keep_empty,
        "texttype": texttype,
    }
    run_pipeline(**pipeline_kwargs)


try:
    _main()
except Exception:
    # Show the full traceback in Stata's results window, then re-raise so the
    # failure still propagates to the caller.
    for report_line in (
        "littext: ERROR during pipeline execution:",
        traceback.format_exc(),
    ):
        print(report_line, flush=True)
    raise