"""
Small-sample uncertainty quantification for LOO-CV conformal prediction.
This module handles all four sources of uncertainty:
1. LOO-CV correlation structure
2. Threshold calibration uncertainty
3. Parameter estimation uncertainty
4. Test sampling uncertainty
Designed for small calibration sets (n=20-40) where bootstrap is unreliable.
"""
import warnings
from typing import Any
import numpy as np
from scipy.stats import beta as beta_dist
from scipy.stats import binom, norm
from scipy.stats import t as t_dist
[docs]
def estimate_loo_inflation_factor(loo_predictions: np.ndarray, verbose: bool = True) -> float:
"""
Estimate the actual variance inflation from LOO-CV for this specific problem.
Theory predicts inflation ≈ 2×, but the actual value depends on the model
and data structure. This computes an empirical estimate.
Parameters:
-----------
loo_predictions : np.ndarray, shape (n_cal,)
Binary LOO predictions (1=success, 0=failure)
Returns:
--------
inflation_factor : float
Estimated variance inflation, clipped to [1.0, 6.0]
Typically ≈ 2.0 for LOO-CV
Notes:
------
Inflation = Var_empirical / Var_IID
For n → ∞: inflation → 2.0
For small n: inflation can vary, so we clip to reasonable range
"""
n = len(loo_predictions)
p_hat = np.mean(loo_predictions)
# Empirical variance with Bessel correction
var_empirical = np.var(loo_predictions, ddof=1) if n > 1 else p_hat * (1 - p_hat)
# Theoretical IID variance
var_iid = p_hat * (1 - p_hat)
# Compute inflation factor
if var_iid > 1e-10: # Avoid division by zero
inflation = (var_empirical / var_iid) * n / (n - 1)
else:
# Edge case: p_hat ≈ 0 or 1
inflation = 2.0
# Clip to reasonable range
# Lower bound: 1.0 (can't be less than IID)
# Upper bound: 6.0 (extended range for high correlation scenarios)
inflation = np.clip(inflation, 1.0, 6.0)
# Print the estimated inflation factor for visibility
if verbose:
print(f"LOO inflation factor estimated: {inflation:.3f} (clipped to [1.0, 6.0] range)")
return inflation
[docs]
def compute_loo_corrected_bounds_analytical(
loo_predictions: np.ndarray,
n_test: int,
alpha: float = 0.05,
use_t_distribution: bool = True,
inflation_factor: float | None = None,
) -> tuple[float, float, dict[str, Any]]:
"""
METHOD 1: Analytical bounds with LOO correction.
This method:
- Uses empirical variance of LOO predictions (captures correlation)
- Applies theoretical LOO inflation as safety check
- Uses t-distribution for small-sample critical values
- Combines calibration and test sampling uncertainty
Best for: n_cal, n_test ≥ 40 AND inflation_factor ≥ 1.2
Note: When inflation_factor ≈ 1 (low correlation), the normal approximation
becomes unreliable. Use Method 2 (exact binomial) instead in these cases.
Parameters:
-----------
loo_predictions : np.ndarray, shape (n_cal,)
Binary LOO predictions
n_test : int
Size of future test sets
alpha : float
Significance level (e.g., 0.05 for 95% confidence)
use_t_distribution : bool
If True, use t-distribution (recommended for n < 50)
inflation_factor : float or None
Manual override for LOO inflation. If None, auto-estimated.
Returns:
--------
L_prime : float
Lower prediction bound
U_prime : float
Upper prediction bound
diagnostics : dict
Detailed breakdown of variance components
"""
n_cal = len(loo_predictions)
p_hat = np.mean(loo_predictions)
# Small sample warning
if n_cal < 20:
warnings.warn(
f"n_cal={n_cal} is very small. Consider using Method 2 (exact binomial) "
"or Method 3 (Hoeffding) for more conservative bounds.",
stacklevel=2,
)
# Inflation factor should already be estimated at the top level
# This is just for backward compatibility
# Variance components
# Source #1 & #2: LOO variance with correlation correction
# Method A: Empirical variance (captures actual correlation in data)
s_squared = np.var(loo_predictions, ddof=1) if n_cal > 1 else p_hat * (1 - p_hat)
var_calibration_empirical = s_squared / n_cal
# Method B: Theoretical with inflation factor
# For small n, use (n-1) in denominator for bias correction
if inflation_factor is not None:
var_calibration_theoretical = inflation_factor * p_hat * (1 - p_hat) / (n_cal - 1)
else:
# Fallback to empirical variance if no inflation factor provided
var_calibration_theoretical = var_calibration_empirical
# Use the LARGER of the two (conservative)
var_calibration = max(var_calibration_empirical, var_calibration_theoretical)
# Warn if inflation factor is very low - normal approximation may be unreliable
if inflation_factor is not None and inflation_factor < 1.2:
warnings.warn(
f"inflation_factor={inflation_factor:.3f} is very low (< 1.2), indicating low LOO correlation. "
"The analytical method using normal approximation may be unreliable in this regime. "
"Consider using method='exact' (beta-binomial) for more accurate bounds.",
UserWarning,
stacklevel=2,
)
# Source #4: Test sampling variance
var_test = p_hat * (1 - p_hat) / n_test
# Total variance (sources are independent)
var_total = var_calibration + var_test
se_total = np.sqrt(var_total)
# Critical value
if use_t_distribution and n_cal > 2:
# Use t-distribution with df = n_cal - 1
df = n_cal - 1
critical_value = t_dist.ppf(1 - alpha / 2, df)
else:
critical_value = norm.ppf(1 - alpha / 2)
# Construct bounds
L_prime = max(0.0, p_hat - critical_value * se_total)
U_prime = min(1.0, p_hat + critical_value * se_total)
# Diagnostics
diagnostics = {
"p_hat": p_hat,
"n_cal": n_cal,
"n_test": n_test,
"inflation_factor": inflation_factor,
"var_calibration": var_calibration,
"var_test": var_test,
"var_total": var_total,
"se_total": se_total,
"critical_value": critical_value,
"width": U_prime - L_prime,
}
return L_prime, U_prime, diagnostics
[docs]
def compute_loo_corrected_bounds_exact_binomial(
k_loo: int, n_cal: int, n_test: int, alpha: float = 0.05, inflation_factor: float = 2.0
) -> tuple[float, float, dict[str, float]]:
"""
METHOD 2: Exact binomial with effective sample size (CONSERVATIVE).
This method:
- Uses exact beta/binomial distributions (no normal approximation)
- Computes effective sample size accounting for LOO correlation
- Uses worst-case union bound for combining uncertainties
- Works directly with discrete probabilities
Best for: n_cal = 20-40, when you need guaranteed coverage
Parameters:
-----------
k_loo : int
Number of LOO successes
n_cal : int
Calibration set size
n_test : int
Test set size
alpha : float
Significance level
inflation_factor : float
LOO variance inflation (typically 1.5-2.5)
Returns:
--------
L_prime, U_prime : float
Prediction bounds
diagnostics : dict
Detailed breakdown
"""
p_hat = k_loo / n_cal
# Effective sample size after accounting for LOO correlation
# If inflation_factor = 2, then n_effective = n_cal / 2
n_effective = n_cal / inflation_factor
# Step 1: Wide confidence interval for p using effective sample size
# Scale k to effective sample size for beta distribution
k_effective = k_loo / inflation_factor
n_effective_int = int(np.round(n_effective))
k_effective_int = int(np.round(k_effective))
# Split alpha budget: half for calibration CI, half for test sampling
alpha_cal = alpha / 2
alpha_test = alpha / 2
# Clopper-Pearson bounds on effective sample
if k_effective_int == 0:
p_lower = 0.0
else:
p_lower = beta_dist.ppf(alpha_cal / 2, k_effective_int, n_effective_int - k_effective_int + 1)
if k_effective_int == n_effective_int:
p_upper = 1.0
else:
p_upper = beta_dist.ppf(1 - alpha_cal / 2, k_effective_int + 1, n_effective_int - k_effective_int)
# Step 2: Worst-case test sampling at boundaries
# Lower bound: assume p = p_lower, take pessimistic quantile
if p_lower > 0:
L_prime = binom.ppf(alpha_test / 2, n_test, p_lower) / n_test
else:
L_prime = 0.0
# Upper bound: assume p = p_upper, take optimistic quantile
if p_upper < 1:
U_prime = binom.ppf(1 - alpha_test / 2, n_test, p_upper) / n_test
else:
U_prime = 1.0
diagnostics = {
"p_hat": p_hat,
"n_cal": n_cal,
"n_effective": n_effective,
"n_test": n_test,
"inflation_factor": inflation_factor,
"p_lower": p_lower,
"p_upper": p_upper,
"width": U_prime - L_prime,
}
return L_prime, U_prime, diagnostics
[docs]
def compute_loo_corrected_bounds_hoeffding(
loo_predictions: np.ndarray,
n_test: int,
alpha: float = 0.05,
inflation_factor: float | None = None,
verbose: bool = True,
) -> tuple[float, float, dict[str, Any]]:
"""
METHOD 3: Distribution-free Hoeffding bound (ULTRA-CONSERVATIVE).
This method:
- Uses Hoeffding concentration inequality (no distributional assumptions)
- Accounts for LOO correlation via adaptive effective sample size
- Provides guaranteed coverage regardless of distribution
- Widest bounds, suitable as worst-case / sanity check
Best for: When you absolutely need guaranteed coverage
Parameters:
-----------
loo_predictions : np.ndarray
Binary LOO predictions
n_test : int
Test set size
alpha : float
Significance level
inflation_factor : float, optional
LOO correlation inflation factor. If None, uses conservative default of 2.0.
Returns:
--------
L_prime, U_prime : float
Prediction bounds
diagnostics : dict
"""
n_cal = len(loo_predictions)
p_hat = np.mean(loo_predictions)
# Use inflation factor if provided, otherwise use conservative default of 2.0
if inflation_factor is None:
inflation_factor = 2.0
if verbose:
print(f"Using conservative default inflation factor: {inflation_factor:.3f}")
else:
if verbose:
print(f"Using provided inflation factor for Hoeffding: {inflation_factor:.3f}")
# Effective sample size (accounts for LOO correlation)
n_effective_cal = n_cal / inflation_factor
n_effective_test = n_test
# Hoeffding bound: P(|p̂ - p| > ε) ≤ 2 exp(-2nε²)
# Setting 2 exp(-2nε²) = α/2, solve for ε:
# ε = sqrt(log(4/α) / (2n))
# Split alpha: half for calibration, half for test
epsilon_cal = np.sqrt(np.log(4 / alpha) / (2 * n_effective_cal))
epsilon_test = np.sqrt(np.log(4 / alpha) / (2 * n_effective_test))
# Union bound: total epsilon
epsilon_total = epsilon_cal + epsilon_test
# Bounds
L_prime = max(0.0, p_hat - epsilon_total)
U_prime = min(1.0, p_hat + epsilon_total)
diagnostics = {
"p_hat": p_hat,
"n_cal": n_cal,
"inflation_factor": inflation_factor,
"n_effective_cal": n_effective_cal,
"n_test": n_test,
"epsilon_cal": epsilon_cal,
"epsilon_test": epsilon_test,
"epsilon_total": epsilon_total,
"width": U_prime - L_prime,
}
return L_prime, U_prime, diagnostics
[docs]
def compute_loo_corrected_prediction_bounds(
loo_predictions: np.ndarray,
n_test: int,
alpha: float = 0.05,
method: str = "simple",
inflation_factor: float | None = None,
verbose: bool = True,
) -> tuple[float, float, dict]:
"""Compute prediction bounds using Clopper-Pearson + sampling uncertainty with LOO correction.
This function provides the correct statistical approach:
1. Use Clopper-Pearson to bound the true rate p from calibration data
2. Add sampling uncertainty for future test sets of size n_test
3. Account for LOO-CV correlation structure
Parameters
----------
loo_predictions : np.ndarray, shape (n_cal,)
Binary LOO predictions (1=success, 0=failure)
n_test : int
Expected size of future test sets
alpha : float
Significance level (e.g., 0.05 for 95% confidence)
method : str
'simple' - Clopper-Pearson + normal approximation for sampling uncertainty
'beta_binomial' - Exact Beta-Binomial approach
'hoeffding' - Distribution-free Hoeffding bounds
inflation_factor : float, optional
Manual override for LOO variance inflation factor
verbose : bool, default=True
Print diagnostic information
Returns
-------
tuple[float, float, dict]
(lower_bound, upper_bound, diagnostics)
"""
n_cal = len(loo_predictions)
k_cal = int(np.sum(loo_predictions))
p_hat = np.mean(loo_predictions)
if method == "simple":
# Step 1: Clopper-Pearson bounds on true rate p (calibration uncertainty)
from scipy.stats import beta as beta_dist
if k_cal == 0:
cp_lower = 0.0
cp_upper = beta_dist.ppf(1 - alpha / 2, k_cal + 1, n_cal - k_cal)
elif k_cal == n_cal:
cp_lower = beta_dist.ppf(alpha / 2, k_cal, n_cal - k_cal + 1)
cp_upper = 1.0
else:
cp_lower = beta_dist.ppf(alpha / 2, k_cal, n_cal - k_cal + 1)
cp_upper = beta_dist.ppf(1 - alpha / 2, k_cal + 1, n_cal - k_cal)
# Step 2: Add sampling uncertainty for test set
# Use LOO-corrected variance if inflation factor provided
if inflation_factor is None:
inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)
# Sampling uncertainty: Var(X_test) = p(1-p)/n_test
# Use conservative bounds from Clopper-Pearson
sampling_var_lower = cp_lower * (1 - cp_lower) / n_test
sampling_var_upper = cp_upper * (1 - cp_upper) / n_test
# Total uncertainty: calibration + sampling
se_lower = np.sqrt(sampling_var_lower)
se_upper = np.sqrt(sampling_var_upper)
# Normal approximation for sampling uncertainty
z_critical = norm.ppf(1 - alpha / 2)
lower = max(0.0, cp_lower - z_critical * se_lower)
upper = min(1.0, cp_upper + z_critical * se_upper)
diagnostics = {
"method": "clopper_pearson_plus_sampling",
"cp_lower": cp_lower,
"cp_upper": cp_upper,
"sampling_se_lower": se_lower,
"sampling_se_upper": se_upper,
"inflation_factor": inflation_factor,
"z_critical": z_critical,
}
elif method == "beta_binomial":
# Use exact Beta-Binomial approach with LOO correction
if inflation_factor is None:
inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)
# Effective sample size accounting for LOO correlation
n_effective = n_cal / inflation_factor
# Construct effective counts that preserve p_hat while ensuring integers
n_eff_int = max(1, int(np.floor(n_effective)))
k_eff_int = int(np.floor(p_hat * n_eff_int))
# Use beta-binomial bounds with effective sample size
from ssbc.bounds import prediction_bounds_beta_binomial
lower, upper = prediction_bounds_beta_binomial(k_eff_int, n_eff_int, n_test, 1 - alpha)
diagnostics = {
"method": "beta_binomial_loo_corrected",
"inflation_factor": inflation_factor,
"n_effective": n_effective,
"n_eff_int": n_eff_int,
"k_eff_int": k_eff_int,
}
elif method == "hoeffding":
# Use Hoeffding's inequality for distribution-free bounds
if inflation_factor is None:
inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)
# Hoeffding bound: P(|X - E[X]| >= t) <= 2 * exp(-2 * t^2 / n)
# For prediction bounds, we need to account for both calibration and test uncertainty
# Calibration uncertainty (LOO-corrected)
# Hoeffding bound on calibration mean
t_cal = np.sqrt(np.log(2 / alpha) / (2 * n_cal))
cal_lower = max(0.0, p_hat - t_cal)
cal_upper = min(1.0, p_hat + t_cal)
# Test sampling uncertainty
# Hoeffding bound on test set mean
t_test = np.sqrt(np.log(2 / alpha) / (2 * n_test))
# Conservative approach: use worst-case bounds
lower = max(0.0, cal_lower - t_test)
upper = min(1.0, cal_upper + t_test)
diagnostics = {
"method": "hoeffding_distribution_free",
"inflation_factor": inflation_factor,
"t_cal": t_cal,
"t_test": t_test,
"cal_lower": cal_lower,
"cal_upper": cal_upper,
}
else:
raise ValueError(f"Unknown method: {method}. Use 'simple', 'beta_binomial', or 'hoeffding'.")
return lower, upper, diagnostics
[docs]
def compute_robust_prediction_bounds(
loo_predictions: np.ndarray,
n_test: int,
alpha: float = 0.05,
method: str = "auto",
inflation_factor: float | None = None,
verbose: bool = True,
) -> tuple[float, float, dict]:
"""
Main function: Compute robust prediction bounds for small-sample LOO-CV.
This is the primary entry point. It intelligently selects methods based on
sample size and provides comprehensive diagnostics.
Parameters:
-----------
loo_predictions : np.ndarray, shape (n_cal,)
Binary LOO predictions (1=singleton/success, 0=not/failure)
n_test : int
Expected size of future test sets
alpha : float
Significance level (e.g., 0.05 for 95% confidence)
method : str
'auto' - Automatically select best method (recommended)
'analytical' - Method 1: Analytical with LOO correction
'exact' - Method 2: Exact binomial with effective n
'hoeffding' - Method 3: Distribution-free bound
'all' - Compute all three and report
inflation_factor : float, optional
Manual override for LOO variance inflation factor. If None, automatically estimated.
Typical values: 1.0 (no inflation), 2.0 (standard LOO), 1.5-2.5 (empirical range)
verbose : bool, default=True
If True, print diagnostic information about method selection and inflation factors.
Returns:
--------
L_prime : float
Lower prediction bound
U_prime : float
Upper prediction bound
report : dict
Comprehensive diagnostics and method comparison
Usage Examples:
---------------
# Basic usage (auto-selects best method)
L, U, report = compute_robust_prediction_bounds(loo_preds, n_test=50)
# Force conservative method
L, U, report = compute_robust_prediction_bounds(
loo_preds, n_test=50, method='exact'
)
# Compare all methods
L, U, report = compute_robust_prediction_bounds(
loo_preds, n_test=50, method='all'
)
print(report['comparison_table'])
"""
n_cal = len(loo_predictions)
k_loo = int(np.sum(loo_predictions))
# Always estimate inflation factor first (needed for method selection)
estimated_inflation_factor = estimate_loo_inflation_factor(loo_predictions, verbose=False)
# Use provided inflation factor if available, otherwise use estimated
effective_inflation_factor = inflation_factor if inflation_factor is not None else estimated_inflation_factor
# Auto-select method based on sample size AND inflation factor
# When inflation_factor ≈ 1, normal approximation in analytical method is unreliable
# Prefer exact (beta-binomial) method in these cases
if method == "auto":
# If inflation factor is very close to 1, analytical method unreliable
# Use exact method instead (beta-binomial handles low correlation better)
if effective_inflation_factor < 1.2:
# Low correlation case: prefer exact method for reliability
if n_cal >= 20:
method = "exact"
else:
method = "hoeffding"
if verbose:
print(
f"Auto-selected LOO method: {method} "
f"(n_cal={n_cal}, inflation_factor={effective_inflation_factor:.3f} < 1.2, "
"analytical method unreliable for low correlation)"
)
elif n_cal >= 40:
method = "analytical"
if verbose:
print(f"Auto-selected LOO method: {method} (n_cal={n_cal})")
elif n_cal >= 25:
method = "exact"
if verbose:
print(f"Auto-selected LOO method: {method} (n_cal={n_cal})")
else:
method = "hoeffding"
if verbose:
print(f"Auto-selected LOO method: {method} (n_cal={n_cal})")
# Use inflation factor for calculations based on method requirements
if inflation_factor is None:
if method in ["analytical", "hoeffding"]:
inflation_factor = estimated_inflation_factor
if verbose:
print(f"Using estimated LOO inflation factor: {inflation_factor:.3f}")
else:
# For exact method, still use estimated factor
inflation_factor = estimated_inflation_factor
if verbose:
print(f"Using estimated LOO inflation factor: {inflation_factor:.3f} (exact method)")
else:
# User provided value - use it for calculations, but still report estimated
if verbose:
print(
"Using provided LOO inflation factor: "
f"{inflation_factor:.3f} "
"(estimated from data: "
f"{estimated_inflation_factor:.3f})"
)
# Compute bounds with selected method
if method == "analytical":
L, U, diag = compute_loo_corrected_bounds_analytical(
loo_predictions, n_test, alpha, inflation_factor=inflation_factor
)
selected_method = "analytical"
elif method == "exact":
# Pass through provided/estimated inflation factor for consistency
infl = (
inflation_factor
if inflation_factor is not None
else estimate_loo_inflation_factor(loo_predictions, verbose=False)
)
L, U, diag = compute_loo_corrected_bounds_exact_binomial(k_loo, n_cal, n_test, alpha, inflation_factor=infl)
selected_method = "exact"
elif method == "hoeffding":
L, U, diag = compute_loo_corrected_bounds_hoeffding(
loo_predictions, n_test, alpha, inflation_factor=inflation_factor, verbose=verbose
)
selected_method = "hoeffding"
elif method == "all":
# Always estimate inflation factor for reporting
# Use provided value if available, otherwise use estimated
if inflation_factor is None:
inflation_factor = estimated_inflation_factor
if verbose:
print(f"Using estimated LOO inflation factor: {inflation_factor:.3f} for comparison...")
else:
if verbose:
print(
"Using provided LOO inflation factor: "
f"{inflation_factor:.3f} "
"(estimated: "
f"{estimated_inflation_factor:.3f}) for comparison..."
)
# Compute all three methods while suppressing small-n and low-inflation warnings
# specific to analytical method for method comparison runs
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message=r"^n_cal=\\d+ is very small|inflation_factor.*is very low",
category=UserWarning,
)
L1, U1, diag1 = compute_loo_corrected_bounds_analytical(
loo_predictions, n_test, alpha, inflation_factor=inflation_factor
)
# Use same inflation factor for exact method to ensure consistency across methods
L2, U2, diag2 = compute_loo_corrected_bounds_exact_binomial(
k_loo, n_cal, n_test, alpha, inflation_factor=inflation_factor
)
L3, U3, diag3 = compute_loo_corrected_bounds_hoeffding(
loo_predictions, n_test, alpha, inflation_factor=inflation_factor, verbose=verbose
)
# Choose method based on reliability:
# - If inflation factor < 1.2, prefer exact (analytical unreliable for low correlation)
# - Otherwise prefer analytical if bounds are reasonable
# - Auto-correct if analytical bounds are suspiciously narrow
if effective_inflation_factor < 1.2:
# Low correlation: prefer exact method (more reliable)
L, U = L2, U2
selected_method = "exact (low correlation, inflation_factor < 1.2)"
if verbose:
print(
f"Note: Selected exact method due to low correlation "
f"(inflation_factor={effective_inflation_factor:.3f} < 1.2). "
"Analytical method may be unreliable in this regime."
)
elif (U1 - L1) < 0.7 * (U2 - L2):
# Analytical bounds suspiciously narrow - auto-correct to exact
L, U = L2, U2
selected_method = "exact (auto-corrected, analytical too narrow)"
if verbose:
print("Note: Auto-corrected to exact method - analytical bounds were suspiciously narrow.")
else:
# Analytical bounds reasonable
L, U = L1, U1
selected_method = "analytical"
# Build comparison table
comparison = {
"method": ["Analytical", "Exact Binomial", "Hoeffding"],
"lower": [L1, L2, L3],
"upper": [U1, U2, U3],
"width": [U1 - L1, U2 - L2, U3 - L3],
}
report = {
"selected_method": selected_method,
"bounds": (L, U),
"comparison": comparison,
"diagnostics": {"analytical": diag1, "exact": diag2, "hoeffding": diag3},
"inflation_factor_used": inflation_factor if inflation_factor is not None else None,
"inflation_factor_estimated": estimated_inflation_factor,
}
return L, U, report
else:
raise ValueError(f"Unknown method: {method}")
# Build report - always include estimated inflation factor
report = {
"selected_method": selected_method,
"bounds": (L, U),
"diagnostics": diag,
"alpha": alpha,
"confidence_level": 1 - alpha,
"inflation_factor_used": inflation_factor if inflation_factor is not None else None,
"inflation_factor_estimated": estimated_inflation_factor,
}
return L, U, report