"""litdiscover_net.py v0.3.0 14may2026 Copyright (C) 2026 Nebojsa S. Davcik, EM Normandie Business School. Email: davcik@live.com. ORCID: 0000-0003-1041-8788. Repository: https://github.com/Davcik/litdiscover Licensed under the GNU General Public License version 3 or later (GPL-3.0-or-later). This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file in the repository root, or , for details. Block C (v0.3): network-analytic measures over the v0.2 construct co-occurrence tables. Reads two Stata datasets (paths supplied via Stata locals): - withindta litdiscover_cooc_within.dta columns: field, value_a, value_b, n_both, n_a, n_b, jaccard - crossdta litdiscover_cooc_cross.dta columns: field_a, value_a, field_b, value_b, n_both, n_a, n_b, jaccard Writes two CSV files (paths supplied via Stata locals): - withincsv one row per (field, value); within-field networks - crosscsv one row per (field_a, field_b, field, value); bipartite The driving .ado imports each CSV via `import delimited` and saves as .dta with the v0.2 file-naming convention (litdiscover_*). Algorithm follows the v0.3 locked spec, sections A6 through A11. Constants (hard-coded in v0.3; exposure deferred to v0.4): Louvain seed = 20250101 Louvain resolution = 1.0 Edge attribute 'sim' = jaccard (Louvain and strength) Edge attribute 'dist' = 1 - jaccard (betweenness; distance form) Surfaces summary scalars back to Stata via Macro.setLocal so that the .ado can return them as r() scalars without writing extra files: net_networks_within, net_networks_cross, net_nodes_within, net_nodes_cross, net_modularity_mean, net_modularity_min, net_modularity_max, net_louvain_seed. """ import os os.environ["LOKY_MAX_CPU_COUNT"] = "1" os.environ["JOBLIB_MULTIPROCESSING"] = "0" import warnings warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn") warnings.filterwarnings("ignore", category=DeprecationWarning, module="sklearn") warnings.filterwarnings("ignore", category=UserWarning, module="joblib") from sfi import Macro import math import numpy as np import pandas as pd import networkx as nx from networkx.algorithms import community as nx_community LOUVAIN_SEED = 20250101 LOUVAIN_RESOLUTION = 1.0 def _read_stata_safe(path): """Read a .dta, returning an empty DataFrame on read failure.""" try: return pd.read_stata(path, convert_categoricals=False) except (FileNotFoundError, ValueError) as exc: print("LITDISCOVER_NET: could not read", path, "-", str(exc)) return pd.DataFrame() def _build_graph(edge_rows, node_id_fn): """Build an undirected weighted graph from an iterable of edge rows. edge_rows: iterable of dicts with keys 'u', 'v', 'jaccard'. node_id_fn: identity function for simple graphs; namespacing for cross. """ G = nx.Graph() for r in edge_rows: j = r["jaccard"] if j is None: continue if isinstance(j, float) and math.isnan(j): continue if j <= 0.0: continue u = node_id_fn(r["u"]) v = node_id_fn(r["v"]) if u == v: continue G.add_edge(u, v, sim=float(j), dist=float(1.0 - j)) return G def _measures_for_graph(G): """Return per-node measure dict and network-level scalars.""" if G.number_of_nodes() == 0: return {}, 0, 0, float("nan"), [] deg_norm = nx.degree_centrality(G) strength = dict(G.degree(weight="sim")) btw = nx.betweenness_centrality(G, weight="dist", normalized=True) try: communities = nx_community.louvain_communities( G, weight="sim", resolution=LOUVAIN_RESOLUTION, seed=LOUVAIN_SEED, ) except Exception as exc: print("LITDISCOVER_NET: Louvain failed -", str(exc), "; falling back to singletons") communities = [{n} for n in G.nodes()] node_to_comm = {} for c_idx, members in enumerate(communities): for n in members: node_to_comm[n] = c_idx try: mod_val = nx_community.modularity(G, communities, weight="sim") except Exception as exc: print("LITDISCOVER_NET: modularity failed -", str(exc), "; setting NaN") mod_val = float("nan") per_node = {} for n in G.nodes(): per_node[n] = { "degree": float(deg_norm.get(n, 0.0)), "strength": float(strength.get(n, 0.0)), "betweenness": float(btw.get(n, 0.0)), "community": int(node_to_comm.get(n, 0)), } return ( per_node, int(G.number_of_nodes()), int(G.number_of_edges()), float(mod_val), communities, ) def process_within(within_df): """Build one within-field network per distinct value of 'field'.""" out_rows = [] network_count = 0 mods = [] if within_df.empty: return pd.DataFrame(columns=[ "field", "value", "n_nodes", "n_edges", "degree", "strength", "betweenness", "community", "modularity", ]), network_count, mods for field_val, sub in within_df.groupby("field", sort=True): edges = [] for _, row in sub.iterrows(): edges.append({ "u": row["value_a"], "v": row["value_b"], "jaccard": row["jaccard"], }) G = _build_graph(edges, node_id_fn=lambda x: x) if G.number_of_nodes() == 0: continue per_node, n_nodes, n_edges, mod_val, _ = _measures_for_graph(G) network_count += 1 if not (isinstance(mod_val, float) and math.isnan(mod_val)): mods.append(mod_val) for n, m in per_node.items(): out_rows.append({ "field": str(field_val), "value": str(n), "n_nodes": n_nodes, "n_edges": n_edges, "degree": m["degree"], "strength": m["strength"], "betweenness": m["betweenness"], "community": m["community"], "modularity": mod_val, }) out_df = pd.DataFrame(out_rows, columns=[ "field", "value", "n_nodes", "n_edges", "degree", "strength", "betweenness", "community", "modularity", ]) out_df.sort_values(["field", "value"], kind="mergesort", inplace=True) return out_df, network_count, mods def process_cross(cross_df): """Build one bipartite network per unordered (field_a, field_b) pair.""" cols_out = [ "field_a", "field_b", "field", "value", "n_nodes", "n_edges", "degree", "strength", "betweenness", "community", "modularity", ] out_rows = [] network_count = 0 mods = [] if cross_df.empty: return pd.DataFrame(columns=cols_out), network_count, mods grouped = cross_df.groupby(["field_a", "field_b"], sort=True) for (fa, fb), sub in grouped: edges = [] for _, row in sub.iterrows(): edges.append({ "u": (str(fa), str(row["value_a"])), "v": (str(fb), str(row["value_b"])), "jaccard": row["jaccard"], }) def ns(node_tuple): return node_tuple[0] + "::" + node_tuple[1] edges_ns = [] for e in edges: j = e["jaccard"] if j is None: continue if isinstance(j, float) and math.isnan(j): continue if j <= 0.0: continue edges_ns.append({"u": ns(e["u"]), "v": ns(e["v"]), "jaccard": j}) G = _build_graph(edges_ns, node_id_fn=lambda x: x) if G.number_of_nodes() == 0: continue per_node, n_nodes, n_edges, mod_val, _ = _measures_for_graph(G) network_count += 1 if not (isinstance(mod_val, float) and math.isnan(mod_val)): mods.append(mod_val) for n, m in per_node.items(): sep_idx = n.find("::") n_field = n[:sep_idx] n_value = n[sep_idx + 2:] out_rows.append({ "field_a": str(fa), "field_b": str(fb), "field": n_field, "value": n_value, "n_nodes": n_nodes, "n_edges": n_edges, "degree": m["degree"], "strength": m["strength"], "betweenness": m["betweenness"], "community": m["community"], "modularity": mod_val, }) out_df = pd.DataFrame(out_rows, columns=cols_out) out_df.sort_values(["field_a", "field_b", "field", "value"], kind="mergesort", inplace=True) return out_df, network_count, mods def main(): within_path = Macro.getLocal("withindta") cross_path = Macro.getLocal("crossdta") within_csv = Macro.getLocal("withincsv") cross_csv = Macro.getLocal("crosscsv") print("LITDISCOVER_NET: reading", within_path) within_df = _read_stata_safe(within_path) print("LITDISCOVER_NET: reading", cross_path) cross_df = _read_stata_safe(cross_path) print("LITDISCOVER_NET: building within-field networks") within_out, n_within, mods_within = process_within(within_df) print("LITDISCOVER_NET: within networks =", n_within, ", rows =", len(within_out)) print("LITDISCOVER_NET: building cross-field bipartite networks") cross_out, n_cross, mods_cross = process_cross(cross_df) print("LITDISCOVER_NET: cross networks =", n_cross, ", rows =", len(cross_out)) with open(within_csv, "w", encoding="utf-8", newline="") as fh: within_out.to_csv(fh, index=False) print("LITDISCOVER_NET: wrote", within_csv) with open(cross_csv, "w", encoding="utf-8", newline="") as fh: cross_out.to_csv(fh, index=False) print("LITDISCOVER_NET: wrote", cross_csv) all_mods = mods_within # spec: scalars summarise within networks only if len(all_mods) > 0: mod_mean = float(np.mean(all_mods)) mod_min = float(np.min(all_mods)) mod_max = float(np.max(all_mods)) else: mod_mean = float("nan") mod_min = float("nan") mod_max = float("nan") Macro.setLocal("net_networks_within", str(int(n_within))) Macro.setLocal("net_networks_cross", str(int(n_cross))) Macro.setLocal("net_nodes_within", str(int(len(within_out)))) Macro.setLocal("net_nodes_cross", str(int(len(cross_out)))) Macro.setLocal("net_modularity_mean", str(mod_mean)) Macro.setLocal("net_modularity_min", str(mod_min)) Macro.setLocal("net_modularity_max", str(mod_max)) Macro.setLocal("net_louvain_seed", str(int(LOUVAIN_SEED))) main()