Source code for ssbc.metrics.loo_uncertainty

"""
Small-sample uncertainty quantification for LOO-CV conformal prediction.

This module handles all four sources of uncertainty:
1. LOO-CV correlation structure
2. Threshold calibration uncertainty
3. Parameter estimation uncertainty
4. Test sampling uncertainty

Designed for small calibration sets (n=20-40) where bootstrap is unreliable.
"""

import warnings
from typing import Any

import numpy as np
from scipy.stats import beta as beta_dist
from scipy.stats import binom, norm
from scipy.stats import t as t_dist


[docs] def estimate_loo_inflation_factor(loo_predictions: np.ndarray, verbose: bool = True) -> float: """ Estimate the actual variance inflation from LOO-CV for this specific problem. Theory predicts inflation ≈ 2×, but the actual value depends on the model and data structure. This computes an empirical estimate. Parameters: ----------- loo_predictions : np.ndarray, shape (n_cal,) Binary LOO predictions (1=success, 0=failure) Returns: -------- inflation_factor : float Estimated variance inflation, clipped to [1.0, 6.0] Typically ≈ 2.0 for LOO-CV Notes: ------ Inflation = Var_empirical / Var_IID For n → ∞: inflation → 2.0 For small n: inflation can vary, so we clip to reasonable range """ n = len(loo_predictions) p_hat = np.mean(loo_predictions) # Empirical variance with Bessel correction var_empirical = np.var(loo_predictions, ddof=1) if n > 1 else p_hat * (1 - p_hat) # Theoretical IID variance var_iid = p_hat * (1 - p_hat) # Compute inflation factor if var_iid > 1e-10: # Avoid division by zero inflation = (var_empirical / var_iid) * n / (n - 1) else: # Edge case: p_hat ≈ 0 or 1 inflation = 2.0 # Clip to reasonable range # Lower bound: 1.0 (can't be less than IID) # Upper bound: 6.0 (extended range for high correlation scenarios) inflation = np.clip(inflation, 1.0, 6.0) # Print the estimated inflation factor for visibility if verbose: print(f"LOO inflation factor estimated: {inflation:.3f} (clipped to [1.0, 6.0] range)") return inflation
[docs] def compute_loo_corrected_bounds_analytical( loo_predictions: np.ndarray, n_test: int, alpha: float = 0.05, use_t_distribution: bool = True, inflation_factor: float | None = None, ) -> tuple[float, float, dict[str, Any]]: """ METHOD 1: Analytical bounds with LOO correction. This method: - Uses empirical variance of LOO predictions (captures correlation) - Applies theoretical LOO inflation as safety check - Uses t-distribution for small-sample critical values - Combines calibration and test sampling uncertainty Best for: n_cal, n_test ≥ 40 AND inflation_factor ≥ 1.2 Note: When inflation_factor ≈ 1 (low correlation), the normal approximation becomes unreliable. Use Method 2 (exact binomial) instead in these cases. Parameters: ----------- loo_predictions : np.ndarray, shape (n_cal,) Binary LOO predictions n_test : int Size of future test sets alpha : float Significance level (e.g., 0.05 for 95% confidence) use_t_distribution : bool If True, use t-distribution (recommended for n < 50) inflation_factor : float or None Manual override for LOO inflation. If None, auto-estimated. Returns: -------- L_prime : float Lower prediction bound U_prime : float Upper prediction bound diagnostics : dict Detailed breakdown of variance components """ n_cal = len(loo_predictions) p_hat = np.mean(loo_predictions) # Small sample warning if n_cal < 20: warnings.warn( f"n_cal={n_cal} is very small. Consider using Method 2 (exact binomial) " "or Method 3 (Hoeffding) for more conservative bounds.", stacklevel=2, ) # Inflation factor should already be estimated at the top level # This is just for backward compatibility # Variance components # Source #1 & #2: LOO variance with correlation correction # Method A: Empirical variance (captures actual correlation in data) s_squared = np.var(loo_predictions, ddof=1) if n_cal > 1 else p_hat * (1 - p_hat) var_calibration_empirical = s_squared / n_cal # Method B: Theoretical with inflation factor # For small n, use (n-1) in denominator for bias correction if inflation_factor is not None: var_calibration_theoretical = inflation_factor * p_hat * (1 - p_hat) / (n_cal - 1) else: # Fallback to empirical variance if no inflation factor provided var_calibration_theoretical = var_calibration_empirical # Use the LARGER of the two (conservative) var_calibration = max(var_calibration_empirical, var_calibration_theoretical) # Warn if inflation factor is very low - normal approximation may be unreliable if inflation_factor is not None and inflation_factor < 1.2: warnings.warn( f"inflation_factor={inflation_factor:.3f} is very low (< 1.2), indicating low LOO correlation. " "The analytical method using normal approximation may be unreliable in this regime. " "Consider using method='exact' (beta-binomial) for more accurate bounds.", UserWarning, stacklevel=2, ) # Source #4: Test sampling variance var_test = p_hat * (1 - p_hat) / n_test # Total variance (sources are independent) var_total = var_calibration + var_test se_total = np.sqrt(var_total) # Critical value if use_t_distribution and n_cal > 2: # Use t-distribution with df = n_cal - 1 df = n_cal - 1 critical_value = t_dist.ppf(1 - alpha / 2, df) else: critical_value = norm.ppf(1 - alpha / 2) # Construct bounds L_prime = max(0.0, p_hat - critical_value * se_total) U_prime = min(1.0, p_hat + critical_value * se_total) # Diagnostics diagnostics = { "p_hat": p_hat, "n_cal": n_cal, "n_test": n_test, "inflation_factor": inflation_factor, "var_calibration": var_calibration, "var_test": var_test, "var_total": var_total, "se_total": se_total, "critical_value": critical_value, "width": U_prime - L_prime, } return L_prime, U_prime, diagnostics
[docs] def compute_loo_corrected_bounds_exact_binomial( k_loo: int, n_cal: int, n_test: int, alpha: float = 0.05, inflation_factor: float = 2.0 ) -> tuple[float, float, dict[str, float]]: """ METHOD 2: Exact binomial with effective sample size (CONSERVATIVE). This method: - Uses exact beta/binomial distributions (no normal approximation) - Computes effective sample size accounting for LOO correlation - Uses worst-case union bound for combining uncertainties - Works directly with discrete probabilities Best for: n_cal = 20-40, when you need guaranteed coverage Parameters: ----------- k_loo : int Number of LOO successes n_cal : int Calibration set size n_test : int Test set size alpha : float Significance level inflation_factor : float LOO variance inflation (typically 1.5-2.5) Returns: -------- L_prime, U_prime : float Prediction bounds diagnostics : dict Detailed breakdown """ p_hat = k_loo / n_cal # Effective sample size after accounting for LOO correlation # If inflation_factor = 2, then n_effective = n_cal / 2 n_effective = n_cal / inflation_factor # Step 1: Wide confidence interval for p using effective sample size # Scale k to effective sample size for beta distribution k_effective = k_loo / inflation_factor n_effective_int = int(np.round(n_effective)) k_effective_int = int(np.round(k_effective)) # Split alpha budget: half for calibration CI, half for test sampling alpha_cal = alpha / 2 alpha_test = alpha / 2 # Clopper-Pearson bounds on effective sample if k_effective_int == 0: p_lower = 0.0 else: p_lower = beta_dist.ppf(alpha_cal / 2, k_effective_int, n_effective_int - k_effective_int + 1) if k_effective_int == n_effective_int: p_upper = 1.0 else: p_upper = beta_dist.ppf(1 - alpha_cal / 2, k_effective_int + 1, n_effective_int - k_effective_int) # Step 2: Worst-case test sampling at boundaries # Lower bound: assume p = p_lower, take pessimistic quantile if p_lower > 0: L_prime = binom.ppf(alpha_test / 2, n_test, p_lower) / n_test else: L_prime = 0.0 # Upper bound: assume p = p_upper, take optimistic quantile if p_upper < 1: U_prime = binom.ppf(1 - alpha_test / 2, n_test, p_upper) / n_test else: U_prime = 1.0 diagnostics = { "p_hat": p_hat, "n_cal": n_cal, "n_effective": n_effective, "n_test": n_test, "inflation_factor": inflation_factor, "p_lower": p_lower, "p_upper": p_upper, "width": U_prime - L_prime, } return L_prime, U_prime, diagnostics
[docs] def compute_loo_corrected_bounds_hoeffding( loo_predictions: np.ndarray, n_test: int, alpha: float = 0.05, inflation_factor: float | None = None, verbose: bool = True, ) -> tuple[float, float, dict[str, Any]]: """ METHOD 3: Distribution-free Hoeffding bound (ULTRA-CONSERVATIVE). This method: - Uses Hoeffding concentration inequality (no distributional assumptions) - Accounts for LOO correlation via adaptive effective sample size - Provides guaranteed coverage regardless of distribution - Widest bounds, suitable as worst-case / sanity check Best for: When you absolutely need guaranteed coverage Parameters: ----------- loo_predictions : np.ndarray Binary LOO predictions n_test : int Test set size alpha : float Significance level inflation_factor : float, optional LOO correlation inflation factor. If None, uses conservative default of 2.0. Returns: -------- L_prime, U_prime : float Prediction bounds diagnostics : dict """ n_cal = len(loo_predictions) p_hat = np.mean(loo_predictions) # Use inflation factor if provided, otherwise use conservative default of 2.0 if inflation_factor is None: inflation_factor = 2.0 if verbose: print(f"Using conservative default inflation factor: {inflation_factor:.3f}") else: if verbose: print(f"Using provided inflation factor for Hoeffding: {inflation_factor:.3f}") # Effective sample size (accounts for LOO correlation) n_effective_cal = n_cal / inflation_factor n_effective_test = n_test # Hoeffding bound: P(|p̂ - p| > ε) ≤ 2 exp(-2nε²) # Setting 2 exp(-2nε²) = α/2, solve for ε: # ε = sqrt(log(4/α) / (2n)) # Split alpha: half for calibration, half for test epsilon_cal = np.sqrt(np.log(4 / alpha) / (2 * n_effective_cal)) epsilon_test = np.sqrt(np.log(4 / alpha) / (2 * n_effective_test)) # Union bound: total epsilon epsilon_total = epsilon_cal + epsilon_test # Bounds L_prime = max(0.0, p_hat - epsilon_total) U_prime = min(1.0, p_hat + epsilon_total) diagnostics = { "p_hat": p_hat, "n_cal": n_cal, "inflation_factor": inflation_factor, "n_effective_cal": n_effective_cal, "n_test": n_test, "epsilon_cal": epsilon_cal, "epsilon_test": epsilon_test, "epsilon_total": epsilon_total, "width": U_prime - L_prime, } return L_prime, U_prime, diagnostics
[docs] def compute_loo_corrected_prediction_bounds( loo_predictions: np.ndarray, n_test: int, alpha: float = 0.05, method: str = "simple", inflation_factor: float | None = None, verbose: bool = True, ) -> tuple[float, float, dict]: """Compute prediction bounds using Clopper-Pearson + sampling uncertainty with LOO correction. This function provides the correct statistical approach: 1. Use Clopper-Pearson to bound the true rate p from calibration data 2. Add sampling uncertainty for future test sets of size n_test 3. Account for LOO-CV correlation structure Parameters ---------- loo_predictions : np.ndarray, shape (n_cal,) Binary LOO predictions (1=success, 0=failure) n_test : int Expected size of future test sets alpha : float Significance level (e.g., 0.05 for 95% confidence) method : str 'simple' - Clopper-Pearson + normal approximation for sampling uncertainty 'beta_binomial' - Exact Beta-Binomial approach 'hoeffding' - Distribution-free Hoeffding bounds inflation_factor : float, optional Manual override for LOO variance inflation factor verbose : bool, default=True Print diagnostic information Returns ------- tuple[float, float, dict] (lower_bound, upper_bound, diagnostics) """ n_cal = len(loo_predictions) k_cal = int(np.sum(loo_predictions)) p_hat = np.mean(loo_predictions) if method == "simple": # Step 1: Clopper-Pearson bounds on true rate p (calibration uncertainty) from scipy.stats import beta as beta_dist if k_cal == 0: cp_lower = 0.0 cp_upper = beta_dist.ppf(1 - alpha / 2, k_cal + 1, n_cal - k_cal) elif k_cal == n_cal: cp_lower = beta_dist.ppf(alpha / 2, k_cal, n_cal - k_cal + 1) cp_upper = 1.0 else: cp_lower = beta_dist.ppf(alpha / 2, k_cal, n_cal - k_cal + 1) cp_upper = beta_dist.ppf(1 - alpha / 2, k_cal + 1, n_cal - k_cal) # Step 2: Add sampling uncertainty for test set # Use LOO-corrected variance if inflation factor provided if inflation_factor is None: inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False) # Sampling uncertainty: Var(X_test) = p(1-p)/n_test # Use conservative bounds from Clopper-Pearson sampling_var_lower = cp_lower * (1 - cp_lower) / n_test sampling_var_upper = cp_upper * (1 - cp_upper) / n_test # Total uncertainty: calibration + sampling se_lower = np.sqrt(sampling_var_lower) se_upper = np.sqrt(sampling_var_upper) # Normal approximation for sampling uncertainty z_critical = norm.ppf(1 - alpha / 2) lower = max(0.0, cp_lower - z_critical * se_lower) upper = min(1.0, cp_upper + z_critical * se_upper) diagnostics = { "method": "clopper_pearson_plus_sampling", "cp_lower": cp_lower, "cp_upper": cp_upper, "sampling_se_lower": se_lower, "sampling_se_upper": se_upper, "inflation_factor": inflation_factor, "z_critical": z_critical, } elif method == "beta_binomial": # Use exact Beta-Binomial approach with LOO correction if inflation_factor is None: inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False) # Effective sample size accounting for LOO correlation n_effective = n_cal / inflation_factor # Construct effective counts that preserve p_hat while ensuring integers n_eff_int = max(1, int(np.floor(n_effective))) k_eff_int = int(np.floor(p_hat * n_eff_int)) # Use beta-binomial bounds with effective sample size from ssbc.bounds import prediction_bounds_beta_binomial lower, upper = prediction_bounds_beta_binomial(k_eff_int, n_eff_int, n_test, 1 - alpha) diagnostics = { "method": "beta_binomial_loo_corrected", "inflation_factor": inflation_factor, "n_effective": n_effective, "n_eff_int": n_eff_int, "k_eff_int": k_eff_int, } elif method == "hoeffding": # Use Hoeffding's inequality for distribution-free bounds if inflation_factor is None: inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False) # Hoeffding bound: P(|X - E[X]| >= t) <= 2 * exp(-2 * t^2 / n) # For prediction bounds, we need to account for both calibration and test uncertainty # Calibration uncertainty (LOO-corrected) # Hoeffding bound on calibration mean t_cal = np.sqrt(np.log(2 / alpha) / (2 * n_cal)) cal_lower = max(0.0, p_hat - t_cal) cal_upper = min(1.0, p_hat + t_cal) # Test sampling uncertainty # Hoeffding bound on test set mean t_test = np.sqrt(np.log(2 / alpha) / (2 * n_test)) # Conservative approach: use worst-case bounds lower = max(0.0, cal_lower - t_test) upper = min(1.0, cal_upper + t_test) diagnostics = { "method": "hoeffding_distribution_free", "inflation_factor": inflation_factor, "t_cal": t_cal, "t_test": t_test, "cal_lower": cal_lower, "cal_upper": cal_upper, } else: raise ValueError(f"Unknown method: {method}. Use 'simple', 'beta_binomial', or 'hoeffding'.") return lower, upper, diagnostics
[docs] def compute_robust_prediction_bounds( loo_predictions: np.ndarray, n_test: int, alpha: float = 0.05, method: str = "auto", inflation_factor: float | None = None, verbose: bool = True, ) -> tuple[float, float, dict]: """ Main function: Compute robust prediction bounds for small-sample LOO-CV. This is the primary entry point. It intelligently selects methods based on sample size and provides comprehensive diagnostics. Parameters: ----------- loo_predictions : np.ndarray, shape (n_cal,) Binary LOO predictions (1=singleton/success, 0=not/failure) n_test : int Expected size of future test sets alpha : float Significance level (e.g., 0.05 for 95% confidence) method : str 'auto' - Automatically select best method (recommended) 'analytical' - Method 1: Analytical with LOO correction 'exact' - Method 2: Exact binomial with effective n 'hoeffding' - Method 3: Distribution-free bound 'all' - Compute all three and report inflation_factor : float, optional Manual override for LOO variance inflation factor. If None, automatically estimated. Typical values: 1.0 (no inflation), 2.0 (standard LOO), 1.5-2.5 (empirical range) verbose : bool, default=True If True, print diagnostic information about method selection and inflation factors. Returns: -------- L_prime : float Lower prediction bound U_prime : float Upper prediction bound report : dict Comprehensive diagnostics and method comparison Usage Examples: --------------- # Basic usage (auto-selects best method) L, U, report = compute_robust_prediction_bounds(loo_preds, n_test=50) # Force conservative method L, U, report = compute_robust_prediction_bounds( loo_preds, n_test=50, method='exact' ) # Compare all methods L, U, report = compute_robust_prediction_bounds( loo_preds, n_test=50, method='all' ) print(report['comparison_table']) """ n_cal = len(loo_predictions) k_loo = int(np.sum(loo_predictions)) # Always estimate inflation factor first (needed for method selection) estimated_inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False) # Use provided inflation factor if available, otherwise use estimated effective_inflation_factor = inflation_factor if inflation_factor is not None else estimated_inflation_factor # Auto-select method based on sample size AND inflation factor # When inflation_factor ≈ 1, normal approximation in analytical method is unreliable # Prefer exact (beta-binomial) method in these cases if method == "auto": # If inflation factor is very close to 1, analytical method unreliable # Use exact method instead (beta-binomial handles low correlation better) if effective_inflation_factor < 1.2: # Low correlation case: prefer exact method for reliability if n_cal >= 20: method = "exact" else: method = "hoeffding" if verbose: print( f"Auto-selected LOO method: {method} " f"(n_cal={n_cal}, inflation_factor={effective_inflation_factor:.3f} < 1.2, " "analytical method unreliable for low correlation)" ) elif n_cal >= 40: method = "analytical" if verbose: print(f"Auto-selected LOO method: {method} (n_cal={n_cal})") elif n_cal >= 25: method = "exact" if verbose: print(f"Auto-selected LOO method: {method} (n_cal={n_cal})") else: method = "hoeffding" if verbose: print(f"Auto-selected LOO method: {method} (n_cal={n_cal})") # Use inflation factor for calculations based on method requirements if inflation_factor is None: if method in ["analytical", "hoeffding"]: inflation_factor = estimated_inflation_factor if verbose: print(f"Using estimated LOO inflation factor: {inflation_factor:.3f}") else: # For exact method, still use estimated factor inflation_factor = estimated_inflation_factor if verbose: print(f"Using estimated LOO inflation factor: {inflation_factor:.3f} (exact method)") else: # User provided value - use it for calculations, but still report estimated if verbose: print( "Using provided LOO inflation factor: " f"{inflation_factor:.3f} " "(estimated from data: " f"{estimated_inflation_factor:.3f})" ) # Compute bounds with selected method if method == "analytical": L, U, diag = compute_loo_corrected_bounds_analytical( loo_predictions, n_test, alpha, inflation_factor=inflation_factor ) selected_method = "analytical" elif method == "exact": # Pass through provided/estimated inflation factor for consistency infl = ( inflation_factor if inflation_factor is not None else estimate_loo_inflation_factor(loo_predictions, verbose=False) ) L, U, diag = compute_loo_corrected_bounds_exact_binomial(k_loo, n_cal, n_test, alpha, inflation_factor=infl) selected_method = "exact" elif method == "hoeffding": L, U, diag = compute_loo_corrected_bounds_hoeffding( loo_predictions, n_test, alpha, inflation_factor=inflation_factor, verbose=verbose ) selected_method = "hoeffding" elif method == "all": # Always estimate inflation factor for reporting # Use provided value if available, otherwise use estimated if inflation_factor is None: inflation_factor = estimated_inflation_factor if verbose: print(f"Using estimated LOO inflation factor: {inflation_factor:.3f} for comparison...") else: if verbose: print( "Using provided LOO inflation factor: " f"{inflation_factor:.3f} " "(estimated: " f"{estimated_inflation_factor:.3f}) for comparison..." ) # Compute all three methods while suppressing small-n and low-inflation warnings # specific to analytical method for method comparison runs with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message=r"^n_cal=\\d+ is very small|inflation_factor.*is very low", category=UserWarning, ) L1, U1, diag1 = compute_loo_corrected_bounds_analytical( loo_predictions, n_test, alpha, inflation_factor=inflation_factor ) # Use same inflation factor for exact method to ensure consistency across methods L2, U2, diag2 = compute_loo_corrected_bounds_exact_binomial( k_loo, n_cal, n_test, alpha, inflation_factor=inflation_factor ) L3, U3, diag3 = compute_loo_corrected_bounds_hoeffding( loo_predictions, n_test, alpha, inflation_factor=inflation_factor, verbose=verbose ) # Choose method based on reliability: # - If inflation factor < 1.2, prefer exact (analytical unreliable for low correlation) # - Otherwise prefer analytical if bounds are reasonable # - Auto-correct if analytical bounds are suspiciously narrow if effective_inflation_factor < 1.2: # Low correlation: prefer exact method (more reliable) L, U = L2, U2 selected_method = "exact (low correlation, inflation_factor < 1.2)" if verbose: print( f"Note: Selected exact method due to low correlation " f"(inflation_factor={effective_inflation_factor:.3f} < 1.2). " "Analytical method may be unreliable in this regime." ) elif (U1 - L1) < 0.7 * (U2 - L2): # Analytical bounds suspiciously narrow - auto-correct to exact L, U = L2, U2 selected_method = "exact (auto-corrected, analytical too narrow)" if verbose: print("Note: Auto-corrected to exact method - analytical bounds were suspiciously narrow.") else: # Analytical bounds reasonable L, U = L1, U1 selected_method = "analytical" # Build comparison table comparison = { "method": ["Analytical", "Exact Binomial", "Hoeffding"], "lower": [L1, L2, L3], "upper": [U1, U2, U3], "width": [U1 - L1, U2 - L2, U3 - L3], } report = { "selected_method": selected_method, "bounds": (L, U), "comparison": comparison, "diagnostics": {"analytical": diag1, "exact": diag2, "hoeffding": diag3}, "inflation_factor_used": inflation_factor if inflation_factor is not None else None, "inflation_factor_estimated": estimated_inflation_factor, } return L, U, report else: raise ValueError(f"Unknown method: {method}") # Build report - always include estimated inflation factor report = { "selected_method": selected_method, "bounds": (L, U), "diagnostics": diag, "alpha": alpha, "confidence_level": 1 - alpha, "inflation_factor_used": inflation_factor if inflation_factor is not None else None, "inflation_factor_estimated": estimated_inflation_factor, } return L, U, report
[docs] def format_prediction_bounds_report( rate_name: str, loo_predictions: np.ndarray, n_test: int, alpha: float = 0.05, include_all_methods: bool = True ) -> str: """ Generate a formatted text report of prediction bounds. This produces human-readable output suitable for inclusion in rigorous analysis reports. Parameters: ----------- rate_name : str Name of the rate (e.g., 'Singleton Rate', 'Doublet Rate') loo_predictions : np.ndarray Binary LOO predictions n_test : int Test set size alpha : float Significance level include_all_methods : bool If True, compare all three methods in report Returns: -------- report : str Formatted text report """ n_cal = len(loo_predictions) k_loo = int(np.sum(loo_predictions)) p_hat = k_loo / n_cal # Compute bounds if include_all_methods: L, U, results = compute_robust_prediction_bounds(loo_predictions, n_test, alpha, method="all") comp = results["comparison"] else: L, U, results = compute_robust_prediction_bounds(loo_predictions, n_test, alpha, method="auto") # Format report report_lines = [ f"\n{'=' * 70}", f"PREDICTION BOUNDS: {rate_name}", f"{'=' * 70}", "\nCalibration Data (LOO-CV):", f" Sample size: n_cal = {n_cal}", f" Successes: k = {k_loo}", f" Point estimate: p̂ = {p_hat:.4f} ({p_hat * 100:.2f}%)", "\nTest Data:", f" Expected test size: n_test = {n_test}", f"\nConfidence Level: {(1 - alpha) * 100:.1f}%", f"\n{'-' * 70}", "PREDICTION INTERVAL (accounts for all uncertainty sources):", f" Lower bound: L' = {L:.4f} ({L * 100:.2f}%)", f" Upper bound: U' = {U:.4f} ({U * 100:.2f}%)", f" Width: {U - L:.4f} ({(U - L) * 100:.2f}%)", f" Selected method: {results['selected_method']}", ] if include_all_methods and "comparison" in results: report_lines.extend( [ f"\n{'-' * 70}", "METHOD COMPARISON:", f" {'Method':<20} {'Lower':>10} {'Upper':>10} {'Width':>10}", f" {'-' * 20} {'-' * 10} {'-' * 10} {'-' * 10}", ] ) for i, method in enumerate(comp["method"]): L_i, U_i, W_i = comp["lower"][i], comp["upper"][i], comp["width"][i] report_lines.append(f" {method:<20} {L_i:>10.4f} {U_i:>10.4f} {W_i:>10.4f}") # Add uncertainty breakdown if "diagnostics" in results and "var_calibration" in results["diagnostics"]: diag = results["diagnostics"] report_lines.extend( [ f"\n{'-' * 70}", "UNCERTAINTY BREAKDOWN:", f" Calibration uncertainty: SE_cal = {np.sqrt(diag['var_calibration']):.4f}", f" Test sampling uncertainty: SE_test = {np.sqrt(diag['var_test']):.4f}", f" Total uncertainty: SE_total = {diag['se_total']:.4f}", f" LOO inflation factor: {diag['inflation_factor']:.2f}×", ] ) report_lines.extend( [ f"\n{'-' * 70}", "INTERPRETATION:", f" We are {(1 - alpha) * 100:.0f}% confident that future test sets of size {n_test}", f" will have {rate_name.lower()} between {L * 100:.2f}% and {U * 100:.2f}%.", "\n This interval accounts for:", " 1. LOO-CV correlation structure (variance inflation ≈2×)", " 2. Threshold calibration uncertainty", " 3. Parameter estimation uncertainty", " 4. Test set sampling variability", f"{'=' * 70}\n", ] ) return "\n".join(report_lines)