Source code for ssbc.metrics.loo_uncertainty

"""
Small-sample uncertainty quantification for LOO-CV conformal prediction.

This module handles all four sources of uncertainty:
1. LOO-CV correlation structure
2. Threshold calibration uncertainty
3. Parameter estimation uncertainty
4. Test sampling uncertainty

Designed for small calibration sets (n=20-40) where bootstrap is unreliable.
"""

import warnings
from typing import Any

import numpy as np
from scipy.stats import beta as beta_dist
from scipy.stats import binom, norm
from scipy.stats import t as t_dist



[docs]
def estimate_loo_inflation_factor(loo_predictions: np.ndarray, verbose: bool = True) -> float:
    """
    Estimate the actual variance inflation from LOO-CV for this specific problem.

    Theory predicts inflation ≈ 2×, but the actual value depends on the model
    and data structure. This computes an empirical estimate.

    Parameters:
    -----------
    loo_predictions : np.ndarray, shape (n_cal,)
        Binary LOO predictions (1=success, 0=failure)

    Returns:
    --------
    inflation_factor : float
        Estimated variance inflation, clipped to [1.0, 6.0]
        Typically ≈ 2.0 for LOO-CV

    Notes:
    ------
    Inflation = Var_empirical / Var_IID
    For n → ∞: inflation → 2.0
    For small n: inflation can vary, so we clip to reasonable range
    """
    n = len(loo_predictions)
    p_hat = np.mean(loo_predictions)

    # Empirical variance with Bessel correction
    var_empirical = np.var(loo_predictions, ddof=1) if n > 1 else p_hat * (1 - p_hat)

    # Theoretical IID variance
    var_iid = p_hat * (1 - p_hat)

    # Compute inflation factor
    if var_iid > 1e-10:  # Avoid division by zero
        inflation = (var_empirical / var_iid) * n / (n - 1)
    else:
        # Edge case: p_hat ≈ 0 or 1
        inflation = 2.0

    # Clip to reasonable range
    # Lower bound: 1.0 (can't be less than IID)
    # Upper bound: 6.0 (extended range for high correlation scenarios)
    inflation = np.clip(inflation, 1.0, 6.0)

    # Print the estimated inflation factor for visibility
    if verbose:
        print(f"LOO inflation factor estimated: {inflation:.3f} (clipped to [1.0, 6.0] range)")

    return inflation




[docs]
def compute_loo_corrected_bounds_analytical(
    loo_predictions: np.ndarray,
    n_test: int,
    alpha: float = 0.05,
    use_t_distribution: bool = True,
    inflation_factor: float | None = None,
) -> tuple[float, float, dict[str, Any]]:
    """
    METHOD 1: Analytical bounds with LOO correction.

    This method:
    - Uses empirical variance of LOO predictions (captures correlation)
    - Applies theoretical LOO inflation as safety check
    - Uses t-distribution for small-sample critical values
    - Combines calibration and test sampling uncertainty

    Best for: n_cal, n_test ≥ 40 AND inflation_factor ≥ 1.2

    Note: When inflation_factor ≈ 1 (low correlation), the normal approximation
    becomes unreliable. Use Method 2 (exact binomial) instead in these cases.

    Parameters:
    -----------
    loo_predictions : np.ndarray, shape (n_cal,)
        Binary LOO predictions
    n_test : int
        Size of future test sets
    alpha : float
        Significance level (e.g., 0.05 for 95% confidence)
    use_t_distribution : bool
        If True, use t-distribution (recommended for n < 50)
    inflation_factor : float or None
        Manual override for LOO inflation. If None, auto-estimated.

    Returns:
    --------
    L_prime : float
        Lower prediction bound
    U_prime : float
        Upper prediction bound
    diagnostics : dict
        Detailed breakdown of variance components
    """
    n_cal = len(loo_predictions)
    p_hat = np.mean(loo_predictions)

    # Small sample warning
    if n_cal < 20:
        warnings.warn(
            f"n_cal={n_cal} is very small. Consider using Method 2 (exact binomial) "
            "or Method 3 (Hoeffding) for more conservative bounds.",
            stacklevel=2,
        )

    # Inflation factor should already be estimated at the top level
    # This is just for backward compatibility

    # Variance components

    # Source #1 & #2: LOO variance with correlation correction
    # Method A: Empirical variance (captures actual correlation in data)
    s_squared = np.var(loo_predictions, ddof=1) if n_cal > 1 else p_hat * (1 - p_hat)
    var_calibration_empirical = s_squared / n_cal

    # Method B: Theoretical with inflation factor
    # For small n, use (n-1) in denominator for bias correction
    if inflation_factor is not None:
        var_calibration_theoretical = inflation_factor * p_hat * (1 - p_hat) / (n_cal - 1)
    else:
        # Fallback to empirical variance if no inflation factor provided
        var_calibration_theoretical = var_calibration_empirical

    # Use the LARGER of the two (conservative)
    var_calibration = max(var_calibration_empirical, var_calibration_theoretical)

    # Warn if inflation factor is very low - normal approximation may be unreliable
    if inflation_factor is not None and inflation_factor < 1.2:
        warnings.warn(
            f"inflation_factor={inflation_factor:.3f} is very low (< 1.2), indicating low LOO correlation. "
            "The analytical method using normal approximation may be unreliable in this regime. "
            "Consider using method='exact' (beta-binomial) for more accurate bounds.",
            UserWarning,
            stacklevel=2,
        )

    # Source #4: Test sampling variance
    var_test = p_hat * (1 - p_hat) / n_test

    # Total variance (sources are independent)
    var_total = var_calibration + var_test
    se_total = np.sqrt(var_total)

    # Critical value
    if use_t_distribution and n_cal > 2:
        # Use t-distribution with df = n_cal - 1
        df = n_cal - 1
        critical_value = t_dist.ppf(1 - alpha / 2, df)
    else:
        critical_value = norm.ppf(1 - alpha / 2)

    # Construct bounds
    L_prime = max(0.0, p_hat - critical_value * se_total)
    U_prime = min(1.0, p_hat + critical_value * se_total)

    # Diagnostics
    diagnostics = {
        "p_hat": p_hat,
        "n_cal": n_cal,
        "n_test": n_test,
        "inflation_factor": inflation_factor,
        "var_calibration": var_calibration,
        "var_test": var_test,
        "var_total": var_total,
        "se_total": se_total,
        "critical_value": critical_value,
        "width": U_prime - L_prime,
    }

    return L_prime, U_prime, diagnostics




[docs]
def compute_loo_corrected_bounds_exact_binomial(
    k_loo: int, n_cal: int, n_test: int, alpha: float = 0.05, inflation_factor: float = 2.0
) -> tuple[float, float, dict[str, float]]:
    """
    METHOD 2: Exact binomial with effective sample size (CONSERVATIVE).

    This method:
    - Uses exact beta/binomial distributions (no normal approximation)
    - Computes effective sample size accounting for LOO correlation
    - Uses worst-case union bound for combining uncertainties
    - Works directly with discrete probabilities

    Best for: n_cal = 20-40, when you need guaranteed coverage

    Parameters:
    -----------
    k_loo : int
        Number of LOO successes
    n_cal : int
        Calibration set size
    n_test : int
        Test set size
    alpha : float
        Significance level
    inflation_factor : float
        LOO variance inflation (typically 1.5-2.5)

    Returns:
    --------
    L_prime, U_prime : float
        Prediction bounds
    diagnostics : dict
        Detailed breakdown
    """
    p_hat = k_loo / n_cal

    # Effective sample size after accounting for LOO correlation
    # If inflation_factor = 2, then n_effective = n_cal / 2
    n_effective = n_cal / inflation_factor

    # Step 1: Wide confidence interval for p using effective sample size
    # Scale k to effective sample size for beta distribution
    k_effective = k_loo / inflation_factor
    n_effective_int = int(np.round(n_effective))
    k_effective_int = int(np.round(k_effective))

    # Split alpha budget: half for calibration CI, half for test sampling
    alpha_cal = alpha / 2
    alpha_test = alpha / 2

    # Clopper-Pearson bounds on effective sample
    if k_effective_int == 0:
        p_lower = 0.0
    else:
        p_lower = beta_dist.ppf(alpha_cal / 2, k_effective_int, n_effective_int - k_effective_int + 1)

    if k_effective_int == n_effective_int:
        p_upper = 1.0
    else:
        p_upper = beta_dist.ppf(1 - alpha_cal / 2, k_effective_int + 1, n_effective_int - k_effective_int)

    # Step 2: Worst-case test sampling at boundaries
    # Lower bound: assume p = p_lower, take pessimistic quantile
    if p_lower > 0:
        L_prime = binom.ppf(alpha_test / 2, n_test, p_lower) / n_test
    else:
        L_prime = 0.0

    # Upper bound: assume p = p_upper, take optimistic quantile
    if p_upper < 1:
        U_prime = binom.ppf(1 - alpha_test / 2, n_test, p_upper) / n_test
    else:
        U_prime = 1.0

    diagnostics = {
        "p_hat": p_hat,
        "n_cal": n_cal,
        "n_effective": n_effective,
        "n_test": n_test,
        "inflation_factor": inflation_factor,
        "p_lower": p_lower,
        "p_upper": p_upper,
        "width": U_prime - L_prime,
    }

    return L_prime, U_prime, diagnostics




[docs]
def compute_loo_corrected_bounds_hoeffding(
    loo_predictions: np.ndarray,
    n_test: int,
    alpha: float = 0.05,
    inflation_factor: float | None = None,
    verbose: bool = True,
) -> tuple[float, float, dict[str, Any]]:
    """
    METHOD 3: Distribution-free Hoeffding bound (ULTRA-CONSERVATIVE).

    This method:
    - Uses Hoeffding concentration inequality (no distributional assumptions)
    - Accounts for LOO correlation via adaptive effective sample size
    - Provides guaranteed coverage regardless of distribution
    - Widest bounds, suitable as worst-case / sanity check

    Best for: When you absolutely need guaranteed coverage

    Parameters:
    -----------
    loo_predictions : np.ndarray
        Binary LOO predictions
    n_test : int
        Test set size
    alpha : float
        Significance level
    inflation_factor : float, optional
        LOO correlation inflation factor. If None, uses conservative default of 2.0.

    Returns:
    --------
    L_prime, U_prime : float
        Prediction bounds
    diagnostics : dict
    """
    n_cal = len(loo_predictions)
    p_hat = np.mean(loo_predictions)

    # Use inflation factor if provided, otherwise use conservative default of 2.0
    if inflation_factor is None:
        inflation_factor = 2.0
        if verbose:
            print(f"Using conservative default inflation factor: {inflation_factor:.3f}")
    else:
        if verbose:
            print(f"Using provided inflation factor for Hoeffding: {inflation_factor:.3f}")

    # Effective sample size (accounts for LOO correlation)
    n_effective_cal = n_cal / inflation_factor
    n_effective_test = n_test

    # Hoeffding bound: P(|p̂ - p| > ε) ≤ 2 exp(-2nε²)
    # Setting 2 exp(-2nε²) = α/2, solve for ε:
    # ε = sqrt(log(4/α) / (2n))

    # Split alpha: half for calibration, half for test
    epsilon_cal = np.sqrt(np.log(4 / alpha) / (2 * n_effective_cal))
    epsilon_test = np.sqrt(np.log(4 / alpha) / (2 * n_effective_test))

    # Union bound: total epsilon
    epsilon_total = epsilon_cal + epsilon_test

    # Bounds
    L_prime = max(0.0, p_hat - epsilon_total)
    U_prime = min(1.0, p_hat + epsilon_total)

    diagnostics = {
        "p_hat": p_hat,
        "n_cal": n_cal,
        "inflation_factor": inflation_factor,
        "n_effective_cal": n_effective_cal,
        "n_test": n_test,
        "epsilon_cal": epsilon_cal,
        "epsilon_test": epsilon_test,
        "epsilon_total": epsilon_total,
        "width": U_prime - L_prime,
    }

    return L_prime, U_prime, diagnostics




[docs]
def compute_loo_corrected_prediction_bounds(
    loo_predictions: np.ndarray,
    n_test: int,
    alpha: float = 0.05,
    method: str = "simple",
    inflation_factor: float | None = None,
    verbose: bool = True,
) -> tuple[float, float, dict]:
    """Compute prediction bounds using Clopper-Pearson + sampling uncertainty with LOO correction.

    This function provides the correct statistical approach:
    1. Use Clopper-Pearson to bound the true rate p from calibration data
    2. Add sampling uncertainty for future test sets of size n_test
    3. Account for LOO-CV correlation structure

    Parameters
    ----------
    loo_predictions : np.ndarray, shape (n_cal,)
        Binary LOO predictions (1=success, 0=failure)
    n_test : int
        Expected size of future test sets
    alpha : float
        Significance level (e.g., 0.05 for 95% confidence)
    method : str
        'simple' - Clopper-Pearson + normal approximation for sampling uncertainty
        'beta_binomial' - Exact Beta-Binomial approach
        'hoeffding' - Distribution-free Hoeffding bounds
    inflation_factor : float, optional
        Manual override for LOO variance inflation factor
    verbose : bool, default=True
        Print diagnostic information

    Returns
    -------
    tuple[float, float, dict]
        (lower_bound, upper_bound, diagnostics)
    """
    n_cal = len(loo_predictions)
    k_cal = int(np.sum(loo_predictions))
    p_hat = np.mean(loo_predictions)

    if method == "simple":
        # Step 1: Clopper-Pearson bounds on true rate p (calibration uncertainty)
        from scipy.stats import beta as beta_dist

        if k_cal == 0:
            cp_lower = 0.0
            cp_upper = beta_dist.ppf(1 - alpha / 2, k_cal + 1, n_cal - k_cal)
        elif k_cal == n_cal:
            cp_lower = beta_dist.ppf(alpha / 2, k_cal, n_cal - k_cal + 1)
            cp_upper = 1.0
        else:
            cp_lower = beta_dist.ppf(alpha / 2, k_cal, n_cal - k_cal + 1)
            cp_upper = beta_dist.ppf(1 - alpha / 2, k_cal + 1, n_cal - k_cal)

        # Step 2: Add sampling uncertainty for test set
        # Use LOO-corrected variance if inflation factor provided
        if inflation_factor is None:
            inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)

        # Sampling uncertainty: Var(X_test) = p(1-p)/n_test
        # Use conservative bounds from Clopper-Pearson
        sampling_var_lower = cp_lower * (1 - cp_lower) / n_test
        sampling_var_upper = cp_upper * (1 - cp_upper) / n_test

        # Total uncertainty: calibration + sampling
        se_lower = np.sqrt(sampling_var_lower)
        se_upper = np.sqrt(sampling_var_upper)

        # Normal approximation for sampling uncertainty
        z_critical = norm.ppf(1 - alpha / 2)

        lower = max(0.0, cp_lower - z_critical * se_lower)
        upper = min(1.0, cp_upper + z_critical * se_upper)

        diagnostics = {
            "method": "clopper_pearson_plus_sampling",
            "cp_lower": cp_lower,
            "cp_upper": cp_upper,
            "sampling_se_lower": se_lower,
            "sampling_se_upper": se_upper,
            "inflation_factor": inflation_factor,
            "z_critical": z_critical,
        }

    elif method == "beta_binomial":
        # Use exact Beta-Binomial approach with LOO correction
        if inflation_factor is None:
            inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)

        # Effective sample size accounting for LOO correlation
        n_effective = n_cal / inflation_factor
        # Construct effective counts that preserve p_hat while ensuring integers
        n_eff_int = max(1, int(np.floor(n_effective)))
        k_eff_int = int(np.floor(p_hat * n_eff_int))

        # Use beta-binomial bounds with effective sample size
        from ssbc.bounds import prediction_bounds_beta_binomial

        lower, upper = prediction_bounds_beta_binomial(k_eff_int, n_eff_int, n_test, 1 - alpha)

        diagnostics = {
            "method": "beta_binomial_loo_corrected",
            "inflation_factor": inflation_factor,
            "n_effective": n_effective,
            "n_eff_int": n_eff_int,
            "k_eff_int": k_eff_int,
        }

    elif method == "hoeffding":
        # Use Hoeffding's inequality for distribution-free bounds
        if inflation_factor is None:
            inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)

        # Hoeffding bound: P(|X - E[X]| >= t) <= 2 * exp(-2 * t^2 / n)
        # For prediction bounds, we need to account for both calibration and test uncertainty

        # Calibration uncertainty (LOO-corrected)
        # Hoeffding bound on calibration mean
        t_cal = np.sqrt(np.log(2 / alpha) / (2 * n_cal))
        cal_lower = max(0.0, p_hat - t_cal)
        cal_upper = min(1.0, p_hat + t_cal)

        # Test sampling uncertainty
        # Hoeffding bound on test set mean
        t_test = np.sqrt(np.log(2 / alpha) / (2 * n_test))

        # Conservative approach: use worst-case bounds
        lower = max(0.0, cal_lower - t_test)
        upper = min(1.0, cal_upper + t_test)

        diagnostics = {
            "method": "hoeffding_distribution_free",
            "inflation_factor": inflation_factor,
            "t_cal": t_cal,
            "t_test": t_test,
            "cal_lower": cal_lower,
            "cal_upper": cal_upper,
        }

    else:
        raise ValueError(f"Unknown method: {method}. Use 'simple', 'beta_binomial', or 'hoeffding'.")

    return lower, upper, diagnostics




[docs]
def compute_robust_prediction_bounds(
    loo_predictions: np.ndarray,
    n_test: int,
    alpha: float = 0.05,
    method: str = "auto",
    inflation_factor: float | None = None,
    verbose: bool = True,
) -> tuple[float, float, dict]:
    """
    Main function: Compute robust prediction bounds for small-sample LOO-CV.

    This is the primary entry point. It intelligently selects methods based on
    sample size and provides comprehensive diagnostics.

    Parameters:
    -----------
    loo_predictions : np.ndarray, shape (n_cal,)
        Binary LOO predictions (1=singleton/success, 0=not/failure)
    n_test : int
        Expected size of future test sets
    alpha : float
        Significance level (e.g., 0.05 for 95% confidence)
    method : str
        'auto' - Automatically select best method (recommended)
        'analytical' - Method 1: Analytical with LOO correction
        'exact' - Method 2: Exact binomial with effective n
        'hoeffding' - Method 3: Distribution-free bound
        'all' - Compute all three and report
    inflation_factor : float, optional
        Manual override for LOO variance inflation factor. If None, automatically estimated.
        Typical values: 1.0 (no inflation), 2.0 (standard LOO), 1.5-2.5 (empirical range)
    verbose : bool, default=True
        If True, print diagnostic information about method selection and inflation factors.

    Returns:
    --------
    L_prime : float
        Lower prediction bound
    U_prime : float
        Upper prediction bound
    report : dict
        Comprehensive diagnostics and method comparison

    Usage Examples:
    ---------------
    # Basic usage (auto-selects best method)
    L, U, report = compute_robust_prediction_bounds(loo_preds, n_test=50)

    # Force conservative method
    L, U, report = compute_robust_prediction_bounds(
        loo_preds, n_test=50, method='exact'
    )

    # Compare all methods
    L, U, report = compute_robust_prediction_bounds(
        loo_preds, n_test=50, method='all'
    )
    print(report['comparison_table'])
    """
    n_cal = len(loo_predictions)
    k_loo = int(np.sum(loo_predictions))

    # Always estimate inflation factor first (needed for method selection)
    estimated_inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)

    # Use provided inflation factor if available, otherwise use estimated
    effective_inflation_factor = inflation_factor if inflation_factor is not None else estimated_inflation_factor

    # Auto-select method based on sample size AND inflation factor
    # When inflation_factor ≈ 1, normal approximation in analytical method is unreliable
    # Prefer exact (beta-binomial) method in these cases
    if method == "auto":
        # If inflation factor is very close to 1, analytical method unreliable
        # Use exact method instead (beta-binomial handles low correlation better)
        if effective_inflation_factor < 1.2:
            # Low correlation case: prefer exact method for reliability
            if n_cal >= 20:
                method = "exact"
            else:
                method = "hoeffding"
            if verbose:
                print(
                    f"Auto-selected LOO method: {method} "
                    f"(n_cal={n_cal}, inflation_factor={effective_inflation_factor:.3f} < 1.2, "
                    "analytical method unreliable for low correlation)"
                )
        elif n_cal >= 40:
            method = "analytical"
            if verbose:
                print(f"Auto-selected LOO method: {method} (n_cal={n_cal})")
        elif n_cal >= 25:
            method = "exact"
            if verbose:
                print(f"Auto-selected LOO method: {method} (n_cal={n_cal})")
        else:
            method = "hoeffding"
            if verbose:
                print(f"Auto-selected LOO method: {method} (n_cal={n_cal})")

    # Use inflation factor for calculations based on method requirements
    if inflation_factor is None:
        if method in ["analytical", "hoeffding"]:
            inflation_factor = estimated_inflation_factor
            if verbose:
                print(f"Using estimated LOO inflation factor: {inflation_factor:.3f}")
        else:
            # For exact method, still use estimated factor
            inflation_factor = estimated_inflation_factor
            if verbose:
                print(f"Using estimated LOO inflation factor: {inflation_factor:.3f} (exact method)")
    else:
        # User provided value - use it for calculations, but still report estimated
        if verbose:
            print(
                "Using provided LOO inflation factor: "
                f"{inflation_factor:.3f} "
                "(estimated from data: "
                f"{estimated_inflation_factor:.3f})"
            )

    # Compute bounds with selected method
    if method == "analytical":
        L, U, diag = compute_loo_corrected_bounds_analytical(
            loo_predictions, n_test, alpha, inflation_factor=inflation_factor
        )
        selected_method = "analytical"

    elif method == "exact":
        # Pass through provided/estimated inflation factor for consistency
        infl = (
            inflation_factor
            if inflation_factor is not None
            else estimate_loo_inflation_factor(loo_predictions, verbose=False)
        )
        L, U, diag = compute_loo_corrected_bounds_exact_binomial(k_loo, n_cal, n_test, alpha, inflation_factor=infl)
        selected_method = "exact"

    elif method == "hoeffding":
        L, U, diag = compute_loo_corrected_bounds_hoeffding(
            loo_predictions, n_test, alpha, inflation_factor=inflation_factor, verbose=verbose
        )
        selected_method = "hoeffding"

    elif method == "all":
        # Always estimate inflation factor for reporting
        # Use provided value if available, otherwise use estimated
        if inflation_factor is None:
            inflation_factor = estimated_inflation_factor
            if verbose:
                print(f"Using estimated LOO inflation factor: {inflation_factor:.3f} for comparison...")
        else:
            if verbose:
                print(
                    "Using provided LOO inflation factor: "
                    f"{inflation_factor:.3f} "
                    "(estimated: "
                    f"{estimated_inflation_factor:.3f}) for comparison..."
                )

        # Compute all three methods while suppressing small-n and low-inflation warnings
        # specific to analytical method for method comparison runs
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message=r"^n_cal=\\d+ is very small|inflation_factor.*is very low",
                category=UserWarning,
            )
            L1, U1, diag1 = compute_loo_corrected_bounds_analytical(
                loo_predictions, n_test, alpha, inflation_factor=inflation_factor
            )
            # Use same inflation factor for exact method to ensure consistency across methods
            L2, U2, diag2 = compute_loo_corrected_bounds_exact_binomial(
                k_loo, n_cal, n_test, alpha, inflation_factor=inflation_factor
            )
            L3, U3, diag3 = compute_loo_corrected_bounds_hoeffding(
                loo_predictions, n_test, alpha, inflation_factor=inflation_factor, verbose=verbose
            )

        # Choose method based on reliability:
        # - If inflation factor < 1.2, prefer exact (analytical unreliable for low correlation)
        # - Otherwise prefer analytical if bounds are reasonable
        # - Auto-correct if analytical bounds are suspiciously narrow
        if effective_inflation_factor < 1.2:
            # Low correlation: prefer exact method (more reliable)
            L, U = L2, U2
            selected_method = "exact (low correlation, inflation_factor < 1.2)"
            if verbose:
                print(
                    f"Note: Selected exact method due to low correlation "
                    f"(inflation_factor={effective_inflation_factor:.3f} < 1.2). "
                    "Analytical method may be unreliable in this regime."
                )
        elif (U1 - L1) < 0.7 * (U2 - L2):
            # Analytical bounds suspiciously narrow - auto-correct to exact
            L, U = L2, U2
            selected_method = "exact (auto-corrected, analytical too narrow)"
            if verbose:
                print("Note: Auto-corrected to exact method - analytical bounds were suspiciously narrow.")
        else:
            # Analytical bounds reasonable
            L, U = L1, U1
            selected_method = "analytical"

        # Build comparison table
        comparison = {
            "method": ["Analytical", "Exact Binomial", "Hoeffding"],
            "lower": [L1, L2, L3],
            "upper": [U1, U2, U3],
            "width": [U1 - L1, U2 - L2, U3 - L3],
        }

        report = {
            "selected_method": selected_method,
            "bounds": (L, U),
            "comparison": comparison,
            "diagnostics": {"analytical": diag1, "exact": diag2, "hoeffding": diag3},
            "inflation_factor_used": inflation_factor if inflation_factor is not None else None,
            "inflation_factor_estimated": estimated_inflation_factor,
        }

        return L, U, report

    else:
        raise ValueError(f"Unknown method: {method}")

    # Build report - always include estimated inflation factor
    report = {
        "selected_method": selected_method,
        "bounds": (L, U),
        "diagnostics": diag,
        "alpha": alpha,
        "confidence_level": 1 - alpha,
        "inflation_factor_used": inflation_factor if inflation_factor is not None else None,
        "inflation_factor_estimated": estimated_inflation_factor,
    }

    return L, U, report




[docs]
def format_prediction_bounds_report(
    rate_name: str, loo_predictions: np.ndarray, n_test: int, alpha: float = 0.05, include_all_methods: bool = True
) -> str:
    """
    Generate a formatted text report of prediction bounds.

    This produces human-readable output suitable for inclusion in
    rigorous analysis reports.

    Parameters:
    -----------
    rate_name : str
        Name of the rate (e.g., 'Singleton Rate', 'Doublet Rate')
    loo_predictions : np.ndarray
        Binary LOO predictions
    n_test : int
        Test set size
    alpha : float
        Significance level
    include_all_methods : bool
        If True, compare all three methods in report

    Returns:
    --------
    report : str
        Formatted text report
    """
    n_cal = len(loo_predictions)
    k_loo = int(np.sum(loo_predictions))
    p_hat = k_loo / n_cal

    # Compute bounds
    if include_all_methods:
        L, U, results = compute_robust_prediction_bounds(loo_predictions, n_test, alpha, method="all")
        comp = results["comparison"]
    else:
        L, U, results = compute_robust_prediction_bounds(loo_predictions, n_test, alpha, method="auto")

    # Format report
    report_lines = [
        f"\n{'=' * 70}",
        f"PREDICTION BOUNDS: {rate_name}",
        f"{'=' * 70}",
        "\nCalibration Data (LOO-CV):",
        f"  Sample size:        n_cal = {n_cal}",
        f"  Successes:         k = {k_loo}",
        f"  Point estimate:    p̂ = {p_hat:.4f} ({p_hat * 100:.2f}%)",
        "\nTest Data:",
        f"  Expected test size: n_test = {n_test}",
        f"\nConfidence Level:    {(1 - alpha) * 100:.1f}%",
        f"\n{'-' * 70}",
        "PREDICTION INTERVAL (accounts for all uncertainty sources):",
        f"  Lower bound:       L' = {L:.4f} ({L * 100:.2f}%)",
        f"  Upper bound:       U' = {U:.4f} ({U * 100:.2f}%)",
        f"  Width:             {U - L:.4f} ({(U - L) * 100:.2f}%)",
        f"  Selected method:   {results['selected_method']}",
    ]

    if include_all_methods and "comparison" in results:
        report_lines.extend(
            [
                f"\n{'-' * 70}",
                "METHOD COMPARISON:",
                f"  {'Method':<20} {'Lower':>10} {'Upper':>10} {'Width':>10}",
                f"  {'-' * 20} {'-' * 10} {'-' * 10} {'-' * 10}",
            ]
        )
        for i, method in enumerate(comp["method"]):
            L_i, U_i, W_i = comp["lower"][i], comp["upper"][i], comp["width"][i]
            report_lines.append(f"  {method:<20} {L_i:>10.4f} {U_i:>10.4f} {W_i:>10.4f}")

    # Add uncertainty breakdown
    if "diagnostics" in results and "var_calibration" in results["diagnostics"]:
        diag = results["diagnostics"]
        report_lines.extend(
            [
                f"\n{'-' * 70}",
                "UNCERTAINTY BREAKDOWN:",
                f"  Calibration uncertainty:  SE_cal  = {np.sqrt(diag['var_calibration']):.4f}",
                f"  Test sampling uncertainty:   SE_test = {np.sqrt(diag['var_test']):.4f}",
                f"  Total uncertainty:         SE_total = {diag['se_total']:.4f}",
                f"  LOO inflation factor:      {diag['inflation_factor']:.2f}×",
            ]
        )

    report_lines.extend(
        [
            f"\n{'-' * 70}",
            "INTERPRETATION:",
            f"  We are {(1 - alpha) * 100:.0f}% confident that future test sets of size {n_test}",
            f"  will have {rate_name.lower()} between {L * 100:.2f}% and {U * 100:.2f}%.",
            "\n  This interval accounts for:",
            "    1. LOO-CV correlation structure (variance inflation ≈2×)",
            "    2. Threshold calibration uncertainty",
            "    3. Parameter estimation uncertainty",
            "    4. Test set sampling variability",
            f"{'=' * 70}\n",
        ]
    )

    return "\n".join(report_lines)