Source code for ssbc.metrics.operational_bounds_simple

"""Simplified operational bounds for fixed calibration (LOO-CV + CP)."""

import os

import numpy as np
from joblib import Parallel, delayed

from ssbc.bounds import prediction_bounds
from ssbc.core_pkg import SSBCResult

from .loo_uncertainty import compute_robust_prediction_bounds


def _effective_n_jobs(requested_n_jobs: int, n_tasks: int) -> int:
    """Choose a sane level of parallelism to avoid oversubscription on large machines.

    Caps the number of workers by the number of tasks and a hard ceiling to
    reduce process scheduling overhead on 100+ core hosts.
    """
    if requested_n_jobs in (None, 0, 1):
        return 1
    if requested_n_jobs < 0:
        cpu_total = os.cpu_count() or 1
        cap = 32  # hard cap to avoid spawning hundreds of processes
        return max(1, min(cpu_total, cap, n_tasks))
    return max(1, min(int(requested_n_jobs), n_tasks))


def _safe_parallel_map(n_jobs: int, func, iterable, backend: str = "loky"):
    """Execute jobs in parallel if possible, otherwise fall back to serial.

    This avoids sandbox/system-limit failures (e.g., PermissionError from loky)
    by retrying in-process serial execution when multiprocessing is unavailable.
    """
    try:
        return Parallel(n_jobs=n_jobs, backend=backend)(delayed(func)(*args) for args in iterable)
    except Exception:
        # Fallback to serial execution
        return [func(*args) for args in iterable]


def _evaluate_loo_single_sample_marginal(
    idx: int,
    labels: np.ndarray,
    probs: np.ndarray,
    k_0: int,
    k_1: int,
) -> tuple[int, int, int, int, int]:
    """Evaluate single LOO fold for marginal operational rates.

    Parameters
    ----------
    k_0, k_1 : int
        Quantile positions (1-indexed) from SSBC calibration

    Returns
    -------
    tuple[int, int, int, int, int]
        (is_singleton, is_doublet, is_abstention, is_singleton_correct, singleton_predicted_class)
        singleton_predicted_class: 0 if singleton predicted as class 0, 1 if class 1, -1 if not singleton
    """
    mask_0 = labels == 0
    mask_1 = labels == 1

    # Compute LOO thresholds (using FIXED k positions)
    # Class 0
    if mask_0[idx]:
        scores_0_loo = 1.0 - probs[mask_0, 0]
        mask_0_idx = np.where(mask_0)[0]
        loo_position = np.where(mask_0_idx == idx)[0][0]
        scores_0_loo = np.delete(scores_0_loo, loo_position)
    else:
        scores_0_loo = 1.0 - probs[mask_0, 0]

    sorted_0_loo = np.sort(scores_0_loo)
    threshold_0_loo = sorted_0_loo[min(k_0 - 1, len(sorted_0_loo) - 1)]

    # Class 1
    if mask_1[idx]:
        scores_1_loo = 1.0 - probs[mask_1, 1]
        mask_1_idx = np.where(mask_1)[0]
        loo_position = np.where(mask_1_idx == idx)[0][0]
        scores_1_loo = np.delete(scores_1_loo, loo_position)
    else:
        scores_1_loo = 1.0 - probs[mask_1, 1]

    sorted_1_loo = np.sort(scores_1_loo)
    threshold_1_loo = sorted_1_loo[min(k_1 - 1, len(sorted_1_loo) - 1)]

    # Evaluate on held-out sample
    score_0 = 1.0 - probs[idx, 0]
    score_1 = 1.0 - probs[idx, 1]
    true_label = labels[idx]

    in_0 = score_0 <= threshold_0_loo
    in_1 = score_1 <= threshold_1_loo

    # Determine prediction set type
    if in_0 and in_1:
        is_singleton, is_doublet, is_abstention = 0, 1, 0
        is_singleton_correct = 0
        singleton_predicted_class = -1  # Not a singleton
    elif in_0 or in_1:
        is_singleton, is_doublet, is_abstention = 1, 0, 0
        # Determine which class was predicted (only one of in_0 or in_1 is True for singleton)
        if in_0 and not in_1:
            singleton_predicted_class = 0
        elif in_1 and not in_0:
            singleton_predicted_class = 1
        else:
            singleton_predicted_class = -1  # Should not happen
        is_singleton_correct = 1 if (in_0 and true_label == 0) or (in_1 and true_label == 1) else 0
    else:
        is_singleton, is_doublet, is_abstention = 0, 0, 1
        is_singleton_correct = 0
        singleton_predicted_class = -1  # Not a singleton

    return is_singleton, is_doublet, is_abstention, is_singleton_correct, singleton_predicted_class


[docs] def compute_pac_operational_bounds_marginal( ssbc_result_0: SSBCResult, ssbc_result_1: SSBCResult, labels: np.ndarray, probs: np.ndarray, test_size: int, # Now used for prediction bounds ci_level: float = 0.95, pac_level: float = 0.95, # Kept for API compatibility (not used) use_union_bound: bool = True, n_jobs: int = -1, prediction_method: str = "simple", ) -> dict: """Compute marginal operational bounds for FIXED calibration via LOO-CV. Enhanced approach: 1. Use FIXED u_star positions from SSBC calibration 2. Run LOO-CV to get unbiased rate estimates 3. Apply prediction bounds accounting for both calibration and test set sampling uncertainty 4. Optional union bound for simultaneous guarantees This models: "Given fixed calibration, what are rate distributions on future test sets?" The prediction bounds account for both calibration uncertainty and test set sampling variability. Parameters ---------- ssbc_result_0 : SSBCResult SSBC result for class 0 ssbc_result_1 : SSBCResult SSBC result for class 1 labels : np.ndarray True labels probs : np.ndarray Predicted probabilities test_size : int Expected test set size for prediction bounds. Used to account for test set sampling uncertainty. ci_level : float, default=0.95 Confidence level for prediction bounds use_union_bound : bool, default=True Apply Bonferroni for simultaneous guarantees n_jobs : int, default=-1 Number of parallel jobs (-1 = all cores) Returns ------- dict Operational bounds with keys: - 'singleton_rate_bounds': [L, U] - 'doublet_rate_bounds': [L, U] - 'abstention_rate_bounds': [L, U] - 'singleton_error_rate_class0_bounds': [L, U] - 'singleton_error_rate_class1_bounds': [L, U] - 'singleton_error_rate_cond_class0_bounds': [L, U] - 'singleton_error_rate_cond_class1_bounds': [L, U] - 'expected_*_rate': point estimates Note: Marginal singleton_error_rate_bounds is NOT computed because it mixes two different distributions (class 0 and class 1) which cannot be justified statistically. """ n = len(labels) # Compute k (quantile position) from SSBC-corrected alpha # k = ceil((n_class + 1) * (1 - alpha_corrected)) n_0 = ssbc_result_0.n n_1 = ssbc_result_1.n k_0 = int(np.ceil((n_0 + 1) * (1 - ssbc_result_0.alpha_corrected))) k_1 = int(np.ceil((n_1 + 1) * (1 - ssbc_result_1.alpha_corrected))) # Parallel LOO-CV: evaluate each sample eff_jobs = _effective_n_jobs(n_jobs, n) results = _safe_parallel_map( eff_jobs, _evaluate_loo_single_sample_marginal, ((idx, labels, probs, k_0, k_1) for idx in range(n)), ) # Aggregate results results_array = np.array(results) n_singletons = int(np.sum(results_array[:, 0])) n_doublets = int(np.sum(results_array[:, 1])) n_abstentions = int(np.sum(results_array[:, 2])) # Point estimates singleton_rate = n_singletons / n doublet_rate = n_doublets / n abstention_rate = n_abstentions / n # Class-specific rates (normalized against full dataset) n_singletons_class0_total = int(np.sum((results_array[:, 0] == 1) & (labels == 0))) n_singletons_class1_total = int(np.sum((results_array[:, 0] == 1) & (labels == 1))) n_doublets_class0_total = int(np.sum((results_array[:, 1] == 1) & (labels == 0))) n_doublets_class1_total = int(np.sum((results_array[:, 1] == 1) & (labels == 1))) n_abstentions_class0_total = int(np.sum((results_array[:, 2] == 1) & (labels == 0))) n_abstentions_class1_total = int(np.sum((results_array[:, 2] == 1) & (labels == 1))) singleton_rate_class0 = n_singletons_class0_total / n if n > 0 else 0.0 singleton_rate_class1 = n_singletons_class1_total / n if n > 0 else 0.0 doublet_rate_class0 = n_doublets_class0_total / n if n > 0 else 0.0 doublet_rate_class1 = n_doublets_class1_total / n if n > 0 else 0.0 abstention_rate_class0 = n_abstentions_class0_total / n if n > 0 else 0.0 abstention_rate_class1 = n_abstentions_class1_total / n if n > 0 else 0.0 # Class-specific singleton error rates (normalized against full dataset) n_errors_class0 = int(np.sum((results_array[:, 0] == 1) & (labels == 0) & (results_array[:, 3] == 0))) n_errors_class1 = int(np.sum((results_array[:, 0] == 1) & (labels == 1) & (results_array[:, 3] == 0))) singleton_error_rate_class0 = n_errors_class0 / n if n > 0 else 0.0 singleton_error_rate_class1 = n_errors_class1 / n if n > 0 else 0.0 # Conditional error rates: P(error | singleton & class) singleton_class0_mask = (results_array[:, 0] == 1) & (labels == 0) singleton_class1_mask = (results_array[:, 0] == 1) & (labels == 1) n_singletons_class0 = int(np.sum(singleton_class0_mask)) n_singletons_class1 = int(np.sum(singleton_class1_mask)) singleton_error_rate_cond_class0 = n_errors_class0 / n_singletons_class0 if n_singletons_class0 > 0 else 0.0 singleton_error_rate_cond_class1 = n_errors_class1 / n_singletons_class1 if n_singletons_class1 > 0 else 0.0 # Apply prediction bounds accounting for both calibration and test set sampling uncertainty # These bound operational rates on future test sets of size test_size # SE = sqrt(p̂(1-p̂) * (1/n_cal + 1/n_test)) accounts for both sources of uncertainty # Now we have 13 metrics: singleton, doublet, abstention, # singleton_class0 (normalized), singleton_class1 (normalized), # doublet_class0 (normalized), doublet_class1 (normalized), # abstention_class0 (normalized), abstention_class1 (normalized), # error_class0 (normalized), error_class1 (normalized), error_cond_class0, error_cond_class1 # Note: We do NOT compute marginal singleton_error because it mixes two different # distributions (class 0 and class 1) which cannot be justified statistically. n_metrics = 13 if use_union_bound: adjusted_ci_level = 1 - (1 - ci_level) / n_metrics else: adjusted_ci_level = ci_level # Use prediction bounds instead of Clopper-Pearson for operational rates singleton_lower, singleton_upper = prediction_bounds( n_singletons, n, test_size, adjusted_ci_level, prediction_method ) doublet_lower, doublet_upper = prediction_bounds(n_doublets, n, test_size, adjusted_ci_level, prediction_method) abstention_lower, abstention_upper = prediction_bounds( n_abstentions, n, test_size, adjusted_ci_level, prediction_method ) # Class-specific singleton rates (normalized against full dataset) # Bernoulli event: Z_i^{sing,0} = 1{Y_i=0, S_i=singleton} # Mean: θ_0^{sing} = P(Y=0, S=singleton) # k_cal: count of class-0 singletons in calibration # n_cal: total calibration size (fixed denominator) # n_test: planned test size (fixed) singleton_class0_lower, singleton_class0_upper = prediction_bounds( n_singletons_class0_total, n, test_size, adjusted_ci_level, prediction_method ) # Bernoulli event: Z_i^{sing,1} = 1{Y_i=1, S_i=singleton} singleton_class1_lower, singleton_class1_upper = prediction_bounds( n_singletons_class1_total, n, test_size, adjusted_ci_level, prediction_method ) # Class-specific doublet rates (normalized against full dataset) doublet_class0_lower, doublet_class0_upper = prediction_bounds( n_doublets_class0_total, n, test_size, adjusted_ci_level, prediction_method ) doublet_class1_lower, doublet_class1_upper = prediction_bounds( n_doublets_class1_total, n, test_size, adjusted_ci_level, prediction_method ) # Class-specific abstention rates (normalized against full dataset) abstention_class0_lower, abstention_class0_upper = prediction_bounds( n_abstentions_class0_total, n, test_size, adjusted_ci_level, prediction_method ) abstention_class1_lower, abstention_class1_upper = prediction_bounds( n_abstentions_class1_total, n, test_size, adjusted_ci_level, prediction_method ) # Class-specific singleton error rates (normalized against full dataset) # Bernoulli event: Z_i^{err,0} = 1{Y_i=0, S_i=singleton, E_i=1} # Mean: θ_0^{err} = P(Y=0, S=singleton, E=1) # k_cal: count of class-0 singleton errors in calibration # n_cal: total calibration size (fixed denominator) # n_test: planned test size (fixed) error_class0_lower, error_class0_upper = prediction_bounds( n_errors_class0, n, test_size, adjusted_ci_level, prediction_method ) # Bernoulli event: Z_i^{err,1} = 1{Y_i=1, S_i=singleton, E_i=1} error_class1_lower, error_class1_upper = prediction_bounds( n_errors_class1, n, test_size, adjusted_ci_level, prediction_method ) # Conditional error rates: P(error | singleton & class) # Bernoulli event: W_i^{err|0} = 1{E_i=1} given Y_i=0, S_i=singleton # Mean: r_0^{err} = P(E=1 | Y=0, S=singleton) # k_cal: count of class-0 singleton errors in calibration # n_cal: count of class-0 singletons in calibration (conditional subpopulation) # n_test: estimated future number of class-0 singletons in test (point estimate) # # NOTE: The denominator (n_test) is random, making bounds conservative. # This is documented in the stability note in the report. expected_n_singletons_class0_test = int(test_size * (n_singletons_class0 / n)) if n > 0 else 1 expected_n_singletons_class0_test = max(expected_n_singletons_class0_test, 1) if n_singletons_class0 > 0 else 1 expected_n_singletons_class1_test = int(test_size * (n_singletons_class1 / n)) if n > 0 else 1 expected_n_singletons_class1_test = max(expected_n_singletons_class1_test, 1) if n_singletons_class1 > 0 else 1 if n_singletons_class0 > 0: error_cond_class0_lower, error_cond_class0_upper = prediction_bounds( n_errors_class0, n_singletons_class0, expected_n_singletons_class0_test, adjusted_ci_level, prediction_method, ) else: error_cond_class0_lower = 0.0 error_cond_class0_upper = 1.0 if n_singletons_class1 > 0: error_cond_class1_lower, error_cond_class1_upper = prediction_bounds( n_errors_class1, n_singletons_class1, expected_n_singletons_class1_test, adjusted_ci_level, prediction_method, ) else: error_cond_class1_lower = 0.0 error_cond_class1_upper = 1.0 return { "singleton_rate_bounds": [singleton_lower, singleton_upper], "doublet_rate_bounds": [doublet_lower, doublet_upper], "abstention_rate_bounds": [abstention_lower, abstention_upper], "singleton_rate_class0_bounds": [singleton_class0_lower, singleton_class0_upper], "singleton_rate_class1_bounds": [singleton_class1_lower, singleton_class1_upper], "doublet_rate_class0_bounds": [doublet_class0_lower, doublet_class0_upper], "doublet_rate_class1_bounds": [doublet_class1_lower, doublet_class1_upper], "abstention_rate_class0_bounds": [abstention_class0_lower, abstention_class0_upper], "abstention_rate_class1_bounds": [abstention_class1_lower, abstention_class1_upper], "singleton_error_rate_class0_bounds": [error_class0_lower, error_class0_upper], "singleton_error_rate_class1_bounds": [error_class1_lower, error_class1_upper], "singleton_error_rate_cond_class0_bounds": [error_cond_class0_lower, error_cond_class0_upper], "singleton_error_rate_cond_class1_bounds": [error_cond_class1_lower, error_cond_class1_upper], "expected_singleton_rate": singleton_rate, "expected_doublet_rate": doublet_rate, "expected_abstention_rate": abstention_rate, "expected_singleton_rate_class0": singleton_rate_class0, "expected_singleton_rate_class1": singleton_rate_class1, "expected_doublet_rate_class0": doublet_rate_class0, "expected_doublet_rate_class1": doublet_rate_class1, "expected_abstention_rate_class0": abstention_rate_class0, "expected_abstention_rate_class1": abstention_rate_class1, "expected_singleton_error_rate_class0": singleton_error_rate_class0, "expected_singleton_error_rate_class1": singleton_error_rate_class1, "expected_singleton_error_rate_cond_class0": singleton_error_rate_cond_class0, "expected_singleton_error_rate_cond_class1": singleton_error_rate_cond_class1, "n_grid_points": 1, # Single scenario (fixed thresholds) "pac_level": adjusted_ci_level, "ci_level": ci_level, "test_size": n, "use_union_bound": use_union_bound, "n_metrics": n_metrics if use_union_bound else None, }
[docs] def compute_pac_operational_bounds_marginal_loo_corrected( ssbc_result_0: SSBCResult, ssbc_result_1: SSBCResult, labels: np.ndarray, probs: np.ndarray, test_size: int, ci_level: float = 0.95, pac_level: float = 0.95, # Kept for API compatibility (not used) use_union_bound: bool = True, n_jobs: int = -1, prediction_method: str = "auto", loo_inflation_factor: float | None = None, verbose: bool = True, ) -> dict: """Compute marginal operational bounds with LOO-CV uncertainty correction. This function uses the new LOO uncertainty quantification that properly accounts for all four sources of uncertainty: 1. LOO-CV correlation structure 2. Threshold calibration uncertainty 3. Parameter estimation uncertainty 4. Test sampling uncertainty This is the RECOMMENDED function for small calibration sets (n=20-40). Parameters ---------- ssbc_result_0 : SSBCResult SSBC result for class 0 ssbc_result_1 : SSBCResult SSBC result for class 1 labels : np.ndarray True labels probs : np.ndarray Predicted probabilities test_size : int Expected test set size for prediction bounds ci_level : float, default=0.95 Confidence level for prediction bounds use_union_bound : bool, default=True Apply Bonferroni for simultaneous guarantees n_jobs : int, default=-1 Number of parallel jobs (-1 = all cores) prediction_method : str, default="auto" Method for LOO uncertainty quantification: - "auto": Automatically select best method - "analytical": Method 1 (recommended for n>=40) - "exact": Method 2 (recommended for n=20-40) - "hoeffding": Method 3 (ultra-conservative) - "all": Compare all methods loo_inflation_factor : float, optional Manual override for LOO variance inflation factor. If None, automatically estimated. Typical values: 1.0 (no inflation), 2.0 (standard LOO), 1.5-2.5 (empirical range) Returns ------- dict Operational bounds with keys: - 'singleton_rate_bounds': [L, U] - 'doublet_rate_bounds': [L, U] - 'abstention_rate_bounds': [L, U] - 'singleton_error_rate_class0_bounds': [L, U] (error when true_label=0) - 'singleton_error_rate_class1_bounds': [L, U] (error when true_label=1) - 'singleton_correct_rate_class0_bounds': [L, U] (correct when true_label=0) - 'singleton_correct_rate_class1_bounds': [L, U] (correct when true_label=1) - 'singleton_error_rate_pred_class0_bounds': [L, U] (error when predicted_class=0) - 'singleton_error_rate_pred_class1_bounds': [L, U] (error when predicted_class=1) - 'singleton_correct_rate_pred_class0_bounds': [L, U] (correct when predicted_class=0) - 'singleton_correct_rate_pred_class1_bounds': [L, U] (correct when predicted_class=1) - 'expected_*_rate': point estimates - 'loo_diagnostics': Detailed LOO uncertainty analysis Note: Marginal singleton_error_rate_bounds is NOT computed because it mixes two different distributions (class 0 and class 1) which cannot be justified statistically. Note: Conditional rates are NOT computed in the marginal section. """ n = len(labels) # Compute k (quantile position) from SSBC-corrected alpha n_0 = ssbc_result_0.n n_1 = ssbc_result_1.n k_0 = int(np.ceil((n_0 + 1) * (1 - ssbc_result_0.alpha_corrected))) k_1 = int(np.ceil((n_1 + 1) * (1 - ssbc_result_1.alpha_corrected))) # Parallel LOO-CV: evaluate each sample results = _safe_parallel_map( n_jobs, _evaluate_loo_single_sample_marginal, ((idx, labels, probs, k_0, k_1) for idx in range(n)), ) # Aggregate results results_array = np.array(results) n_singletons = int(np.sum(results_array[:, 0])) n_doublets = int(np.sum(results_array[:, 1])) n_abstentions = int(np.sum(results_array[:, 2])) # Point estimates singleton_rate = n_singletons / n doublet_rate = n_doublets / n abstention_rate = n_abstentions / n # Convert to binary LOO predictions for each rate type singleton_loo_preds = results_array[:, 0].astype(int) doublet_loo_preds = results_array[:, 1].astype(int) abstention_loo_preds = results_array[:, 2].astype(int) # Class-specific rates (normalized against full dataset) singleton_class0_loo_preds = ((results_array[:, 0] == 1) & (labels == 0)).astype(int) singleton_class1_loo_preds = ((results_array[:, 0] == 1) & (labels == 1)).astype(int) doublet_class0_loo_preds = ((results_array[:, 1] == 1) & (labels == 0)).astype(int) doublet_class1_loo_preds = ((results_array[:, 1] == 1) & (labels == 1)).astype(int) abstention_class0_loo_preds = ((results_array[:, 2] == 1) & (labels == 0)).astype(int) abstention_class1_loo_preds = ((results_array[:, 2] == 1) & (labels == 1)).astype(int) # Class-specific singleton error rates (normalized against full dataset) # Error rate for singletons with true_label=0, normalized by total samples error_class0_loo_preds = ((results_array[:, 0] == 1) & (labels == 0) & (results_array[:, 3] == 0)).astype(int) # Error rate for singletons with true_label=1, normalized by total samples error_class1_loo_preds = ((results_array[:, 0] == 1) & (labels == 1) & (results_array[:, 3] == 0)).astype(int) # Class-specific singleton correct rates (normalized against full dataset) # Correct rate for singletons with true_label=0, normalized by total samples # Bernoulli event: Z_i^{cor,0} = 1{Y_i=0, S_i=singleton, E_i=0} (LOO indicators) # Mean: θ_0^{cor} = P(Y=0, S=singleton, E=0) correct_class0_loo_preds = ((results_array[:, 0] == 1) & (labels == 0) & (results_array[:, 3] == 1)).astype(int) # Correct rate for singletons with true_label=1, normalized by total samples # Bernoulli event: Z_i^{cor,1} = 1{Y_i=1, S_i=singleton, E_i=0} (LOO indicators) # Mean: θ_1^{cor} = P(Y=1, S=singleton, E=0) correct_class1_loo_preds = ((results_array[:, 0] == 1) & (labels == 1) & (results_array[:, 3] == 1)).astype(int) # Error rates when singleton is assigned to a specific class (normalized against full dataset) # Error rate when singleton is assigned class 0, normalized by total samples # Bernoulli event: Z_i^{err,pred0} = 1{predicted_class=0, S_i=singleton, E_i=1} (LOO indicators) # Mean: θ_0^{err,pred} = P(predicted_class=0, S=singleton, E=1) error_pred_class0_loo_preds = ( (results_array[:, 0] == 1) & (results_array[:, 4] == 0) & (results_array[:, 3] == 0) ).astype(int) # Error rate when singleton is assigned class 1, normalized by total samples # Bernoulli event: Z_i^{err,pred1} = 1{predicted_class=1, S_i=singleton, E_i=1} (LOO indicators) # Mean: θ_1^{err,pred} = P(predicted_class=1, S=singleton, E=1) error_pred_class1_loo_preds = ( (results_array[:, 0] == 1) & (results_array[:, 4] == 1) & (results_array[:, 3] == 0) ).astype(int) # Correct rates when singleton is assigned to a specific class (normalized against full dataset) # Correct rate when singleton is assigned class 0, normalized by total samples # Bernoulli event: Z_i^{cor,pred0} = 1{predicted_class=0, S_i=singleton, E_i=0} (LOO indicators) # Mean: θ_0^{cor,pred} = P(predicted_class=0, S=singleton, E=0) correct_pred_class0_loo_preds = ( (results_array[:, 0] == 1) & (results_array[:, 4] == 0) & (results_array[:, 3] == 1) ).astype(int) # Correct rate when singleton is assigned class 1, normalized by total samples # Bernoulli event: Z_i^{cor,pred1} = 1{predicted_class=1, S_i=singleton, E_i=0} (LOO indicators) # Mean: θ_1^{cor,pred} = P(predicted_class=1, S=singleton, E=0) correct_pred_class1_loo_preds = ( (results_array[:, 0] == 1) & (results_array[:, 4] == 1) & (results_array[:, 3] == 1) ).astype(int) # Point estimates for class-specific rates (normalized against full dataset) singleton_rate_class0 = float(np.mean(singleton_class0_loo_preds)) if n > 0 else 0.0 singleton_rate_class1 = float(np.mean(singleton_class1_loo_preds)) if n > 0 else 0.0 doublet_rate_class0 = float(np.mean(doublet_class0_loo_preds)) if n > 0 else 0.0 doublet_rate_class1 = float(np.mean(doublet_class1_loo_preds)) if n > 0 else 0.0 abstention_rate_class0 = float(np.mean(abstention_class0_loo_preds)) if n > 0 else 0.0 abstention_rate_class1 = float(np.mean(abstention_class1_loo_preds)) if n > 0 else 0.0 singleton_error_rate_class0 = float(np.mean(error_class0_loo_preds)) if n > 0 else 0.0 singleton_error_rate_class1 = float(np.mean(error_class1_loo_preds)) if n > 0 else 0.0 singleton_correct_rate_class0 = float(np.mean(correct_class0_loo_preds)) if n > 0 else 0.0 singleton_correct_rate_class1 = float(np.mean(correct_class1_loo_preds)) if n > 0 else 0.0 singleton_error_rate_pred_class0 = float(np.mean(error_pred_class0_loo_preds)) if n > 0 else 0.0 singleton_error_rate_pred_class1 = float(np.mean(error_pred_class1_loo_preds)) if n > 0 else 0.0 singleton_correct_rate_pred_class0 = float(np.mean(correct_pred_class0_loo_preds)) if n > 0 else 0.0 singleton_correct_rate_pred_class1 = float(np.mean(correct_pred_class1_loo_preds)) if n > 0 else 0.0 # Apply union bound adjustment # Now we have 17 metrics: singleton, doublet, abstention, # singleton_class0 (normalized), singleton_class1 (normalized), # doublet_class0 (normalized), doublet_class1 (normalized), # abstention_class0 (normalized), abstention_class1 (normalized), # error_class0 (normalized), error_class1 (normalized), # correct_class0 (normalized), correct_class1 (normalized), # error_pred_class0 (normalized), error_pred_class1 (normalized), # correct_pred_class0 (normalized), correct_pred_class1 (normalized) # Note: We do NOT compute marginal singleton_error because it mixes two different # distributions (class 0 and class 1) which cannot be justified statistically. # Note: We do NOT compute conditional rates in the marginal section. n_metrics = 17 if use_union_bound: adjusted_ci_level = 1 - (1 - ci_level) / n_metrics else: adjusted_ci_level = ci_level # Compute LOO-corrected bounds for each rate type singleton_lower, singleton_upper, singleton_report = compute_robust_prediction_bounds( singleton_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) doublet_lower, doublet_upper, doublet_report = compute_robust_prediction_bounds( doublet_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) abstention_lower, abstention_upper, abstention_report = compute_robust_prediction_bounds( abstention_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # Class-specific singleton rates (normalized against full dataset) # Bernoulli event: Z_i^{sing,0} = 1{Y_i=0, S_i=singleton} (LOO indicators) # Mean: θ_0^{sing} = P(Y=0, S=singleton) # loo_predictions: array of LOO indicators (correlated due to LOO-CV structure) # n_test: planned test size (fixed) singleton_class0_lower, singleton_class0_upper, singleton_class0_report = compute_robust_prediction_bounds( singleton_class0_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) singleton_class1_lower, singleton_class1_upper, singleton_class1_report = compute_robust_prediction_bounds( singleton_class1_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # Class-specific doublet rates (normalized against full dataset) doublet_class0_lower, doublet_class0_upper, doublet_class0_report = compute_robust_prediction_bounds( doublet_class0_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) doublet_class1_lower, doublet_class1_upper, doublet_class1_report = compute_robust_prediction_bounds( doublet_class1_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # Class-specific abstention rates (normalized against full dataset) abstention_class0_lower, abstention_class0_upper, abstention_class0_report = compute_robust_prediction_bounds( abstention_class0_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) abstention_class1_lower, abstention_class1_upper, abstention_class1_report = compute_robust_prediction_bounds( abstention_class1_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # Class-specific singleton error rates (normalized against full dataset) # Bernoulli event: Z_i^{err,0} = 1{Y_i=0, S_i=singleton, E_i=1} (LOO indicators) # Mean: θ_0^{err} = P(Y=0, S=singleton, E=1) # loo_predictions: array of LOO indicators (correlated due to LOO-CV structure) # n_test: planned test size (fixed) error_class0_lower, error_class0_upper, error_class0_report = compute_robust_prediction_bounds( error_class0_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) error_class1_lower, error_class1_upper, error_class1_report = compute_robust_prediction_bounds( error_class1_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # Class-specific singleton correct rates (normalized against full dataset) # Bernoulli event: Z_i^{cor,0} = 1{Y_i=0, S_i=singleton, E_i=0} (LOO indicators) # Mean: θ_0^{cor} = P(Y=0, S=singleton, E=0) # loo_predictions: array of LOO indicators (correlated due to LOO-CV structure) # n_test: planned test size (fixed) correct_class0_lower, correct_class0_upper, correct_class0_report = compute_robust_prediction_bounds( correct_class0_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) correct_class1_lower, correct_class1_upper, correct_class1_report = compute_robust_prediction_bounds( correct_class1_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # Error rates when singleton is assigned to a specific class (normalized against full dataset) # Bernoulli event: Z_i^{err,pred0} = 1{predicted_class=0, S_i=singleton, E_i=1} (LOO indicators) # Mean: θ_0^{err,pred} = P(predicted_class=0, S=singleton, E=1) error_pred_class0_lower, error_pred_class0_upper, error_pred_class0_report = compute_robust_prediction_bounds( error_pred_class0_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) error_pred_class1_lower, error_pred_class1_upper, error_pred_class1_report = compute_robust_prediction_bounds( error_pred_class1_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # Correct rates when singleton is assigned to a specific class (normalized against full dataset) # Bernoulli event: Z_i^{cor,pred0} = 1{predicted_class=0, S_i=singleton, E_i=0} (LOO indicators) # Mean: θ_0^{cor,pred} = P(predicted_class=0, S=singleton, E=0) correct_pred_class0_lower, correct_pred_class0_upper, correct_pred_class0_report = compute_robust_prediction_bounds( correct_pred_class0_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) correct_pred_class1_lower, correct_pred_class1_upper, correct_pred_class1_report = compute_robust_prediction_bounds( correct_pred_class1_loo_preds, test_size, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) return { "singleton_rate_bounds": [singleton_lower, singleton_upper], "doublet_rate_bounds": [doublet_lower, doublet_upper], "abstention_rate_bounds": [abstention_lower, abstention_upper], "singleton_rate_class0_bounds": [singleton_class0_lower, singleton_class0_upper], "singleton_rate_class1_bounds": [singleton_class1_lower, singleton_class1_upper], "doublet_rate_class0_bounds": [doublet_class0_lower, doublet_class0_upper], "doublet_rate_class1_bounds": [doublet_class1_lower, doublet_class1_upper], "abstention_rate_class0_bounds": [abstention_class0_lower, abstention_class0_upper], "abstention_rate_class1_bounds": [abstention_class1_lower, abstention_class1_upper], "singleton_error_rate_class0_bounds": [error_class0_lower, error_class0_upper], "singleton_error_rate_class1_bounds": [error_class1_lower, error_class1_upper], "singleton_correct_rate_class0_bounds": [correct_class0_lower, correct_class0_upper], "singleton_correct_rate_class1_bounds": [correct_class1_lower, correct_class1_upper], "singleton_error_rate_pred_class0_bounds": [error_pred_class0_lower, error_pred_class0_upper], "singleton_error_rate_pred_class1_bounds": [error_pred_class1_lower, error_pred_class1_upper], "singleton_correct_rate_pred_class0_bounds": [correct_pred_class0_lower, correct_pred_class0_upper], "singleton_correct_rate_pred_class1_bounds": [correct_pred_class1_lower, correct_pred_class1_upper], "expected_singleton_rate": singleton_rate, "expected_doublet_rate": doublet_rate, "expected_abstention_rate": abstention_rate, "expected_singleton_rate_class0": singleton_rate_class0, "expected_singleton_rate_class1": singleton_rate_class1, "expected_doublet_rate_class0": doublet_rate_class0, "expected_doublet_rate_class1": doublet_rate_class1, "expected_abstention_rate_class0": abstention_rate_class0, "expected_abstention_rate_class1": abstention_rate_class1, "expected_singleton_error_rate_class0": singleton_error_rate_class0, "expected_singleton_error_rate_class1": singleton_error_rate_class1, "expected_singleton_correct_rate_class0": singleton_correct_rate_class0, "expected_singleton_correct_rate_class1": singleton_correct_rate_class1, "expected_singleton_error_rate_pred_class0": singleton_error_rate_pred_class0, "expected_singleton_error_rate_pred_class1": singleton_error_rate_pred_class1, "expected_singleton_correct_rate_pred_class0": singleton_correct_rate_pred_class0, "expected_singleton_correct_rate_pred_class1": singleton_correct_rate_pred_class1, "n_grid_points": 1, # Single scenario (fixed thresholds) "pac_level": adjusted_ci_level, "ci_level": ci_level, "test_size": test_size, "use_union_bound": use_union_bound, "n_metrics": n_metrics if use_union_bound else None, "loo_diagnostics": { "singleton": singleton_report, "doublet": doublet_report, "abstention": abstention_report, "singleton_class0": singleton_class0_report, "singleton_class1": singleton_class1_report, "doublet_class0": doublet_class0_report, "doublet_class1": doublet_class1_report, "abstention_class0": abstention_class0_report, "abstention_class1": abstention_class1_report, "singleton_error_class0": error_class0_report, "singleton_error_class1": error_class1_report, "singleton_correct_class0": correct_class0_report, "singleton_correct_class1": correct_class1_report, "singleton_error_pred_class0": error_pred_class0_report, "singleton_error_pred_class1": error_pred_class1_report, "singleton_correct_pred_class0": correct_pred_class0_report, "singleton_correct_pred_class1": correct_pred_class1_report, }, }
def _evaluate_loo_single_sample_perclass( idx: int, labels: np.ndarray, probs: np.ndarray, k_0: int, k_1: int, class_label: int, ) -> tuple[int, int, int, int]: """Evaluate single LOO fold for per-class operational rates. Returns ------- tuple[int, int, int, int] (is_singleton, is_doublet, is_abstention, is_singleton_correct) """ # Only evaluate if sample is from class_label if labels[idx] != class_label: return 0, 0, 0, 0 mask_0 = labels == 0 mask_1 = labels == 1 # Compute LOO thresholds # Class 0 if mask_0[idx]: scores_0_loo = 1.0 - probs[mask_0, 0] mask_0_idx = np.where(mask_0)[0] loo_position = np.where(mask_0_idx == idx)[0][0] scores_0_loo = np.delete(scores_0_loo, loo_position) else: scores_0_loo = 1.0 - probs[mask_0, 0] sorted_0_loo = np.sort(scores_0_loo) threshold_0_loo = sorted_0_loo[min(k_0 - 1, len(sorted_0_loo) - 1)] # Class 1 if mask_1[idx]: scores_1_loo = 1.0 - probs[mask_1, 1] mask_1_idx = np.where(mask_1)[0] loo_position = np.where(mask_1_idx == idx)[0][0] scores_1_loo = np.delete(scores_1_loo, loo_position) else: scores_1_loo = 1.0 - probs[mask_1, 1] sorted_1_loo = np.sort(scores_1_loo) threshold_1_loo = sorted_1_loo[min(k_1 - 1, len(sorted_1_loo) - 1)] # Evaluate on held-out sample score_0 = 1.0 - probs[idx, 0] score_1 = 1.0 - probs[idx, 1] true_label = labels[idx] in_0 = score_0 <= threshold_0_loo in_1 = score_1 <= threshold_1_loo # Determine prediction set type if in_0 and in_1: is_singleton, is_doublet, is_abstention = 0, 1, 0 is_singleton_correct = 0 elif in_0 or in_1: is_singleton, is_doublet, is_abstention = 1, 0, 0 is_singleton_correct = 1 if (in_0 and true_label == 0) or (in_1 and true_label == 1) else 0 else: is_singleton, is_doublet, is_abstention = 0, 0, 1 is_singleton_correct = 0 return is_singleton, is_doublet, is_abstention, is_singleton_correct
[docs] def compute_pac_operational_bounds_perclass( ssbc_result_0: SSBCResult, ssbc_result_1: SSBCResult, labels: np.ndarray, probs: np.ndarray, class_label: int, test_size: int, # Now used for prediction bounds ci_level: float = 0.95, pac_level: float = 0.95, # Kept for API compatibility (not used) use_union_bound: bool = True, n_jobs: int = -1, prediction_method: str = "simple", loo_inflation_factor: float | None = None, ) -> dict: """Compute per-class operational bounds for FIXED calibration via LOO-CV. Parameters ---------- class_label : int Which class to analyze (0 or 1) loo_inflation_factor : float, optional Manual override for LOO variance inflation factor. If None, not used. Note: Per-class bounds currently use standard prediction bounds, not LOO-corrected bounds. This parameter is included for API compatibility and future use. Notes ----- The test_size is automatically adjusted based on the expected class distribution: expected_n_class_test = test_size * (n_class_cal / n_total) This ensures proper uncertainty quantification for class-specific rates. Other parameters same as marginal version. Returns ------- dict Per-class operational bounds """ # Compute k from alpha_corrected n_0 = ssbc_result_0.n n_1 = ssbc_result_1.n k_0 = int(np.ceil((n_0 + 1) * (1 - ssbc_result_0.alpha_corrected))) k_1 = int(np.ceil((n_1 + 1) * (1 - ssbc_result_1.alpha_corrected))) # Parallel LOO-CV: evaluate each sample n = len(labels) eff_jobs = _effective_n_jobs(n_jobs, n) results = _safe_parallel_map( eff_jobs, _evaluate_loo_single_sample_perclass, ((idx, labels, probs, k_0, k_1, class_label) for idx in range(n)), ) # Aggregate results (only from class_label samples) results_array = np.array(results) n_singletons = int(np.sum(results_array[:, 0] * (labels == class_label)[:, None])) n_doublets = int(np.sum(results_array[:, 1] * (labels == class_label)[:, None])) n_abstentions = int(np.sum(results_array[:, 2] * (labels == class_label)[:, None])) n_singletons_correct = int(np.sum(results_array[:, 3] * (labels == class_label)[:, None])) # Number of class_label samples in calibration n_class_cal = np.sum(labels == class_label) # Estimate expected class distribution in test set # Use calibration class distribution as estimate for test set n_total = len(labels) class_rate_cal = n_class_cal / n_total expected_n_class_test = int(test_size * class_rate_cal) # Ensure minimum test size for numerical stability expected_n_class_test = max(expected_n_class_test, 1) # Point estimates (calibration proportions) n_errors = n_singletons - n_singletons_correct # Apply prediction bounds accounting for both calibration and test set sampling uncertainty # These bound operational rates on future test sets of size expected_n_class_test # SE = sqrt(p̂(1-p̂) * (1/n_cal + 1/n_test)) accounts for both sources of uncertainty n_metrics = 4 if use_union_bound: adjusted_ci_level = 1 - (1 - ci_level) / n_metrics else: adjusted_ci_level = ci_level # Use prediction bounds instead of Clopper-Pearson for operational rates # Use expected class-specific test size for proper uncertainty quantification singleton_lower, singleton_upper = prediction_bounds( n_singletons, n_class_cal, expected_n_class_test, adjusted_ci_level, prediction_method ) doublet_lower, doublet_upper = prediction_bounds( n_doublets, n_class_cal, expected_n_class_test, adjusted_ci_level, prediction_method ) abstention_lower, abstention_upper = prediction_bounds( n_abstentions, n_class_cal, expected_n_class_test, adjusted_ci_level, prediction_method ) # Singleton error (conditioned on singletons) - use prediction bounds on error rate if n_singletons > 0: error_lower, error_upper = prediction_bounds( n_errors, n_singletons, expected_n_class_test, adjusted_ci_level, prediction_method ) else: error_lower = 0.0 error_upper = 1.0 # Build LOO prediction arrays for unbiased point estimates singleton_loo_preds = results_array[:, 0].astype(int) doublet_loo_preds = results_array[:, 1].astype(int) abstention_loo_preds = results_array[:, 2].astype(int) error_loo_preds = np.zeros(n, dtype=int) if n_singletons > 0: error_loo_preds = (results_array[:, 0] == 1) & (results_array[:, 3] == 0) return { "singleton_rate_bounds": [singleton_lower, singleton_upper], "doublet_rate_bounds": [doublet_lower, doublet_upper], "abstention_rate_bounds": [abstention_lower, abstention_upper], "singleton_error_rate_bounds": [error_lower, error_upper], # Unbiased LOO estimates (means of LOO predictions) "expected_singleton_rate": float(np.mean(singleton_loo_preds)) if n_class_cal > 0 else 0.0, "expected_doublet_rate": float(np.mean(doublet_loo_preds)) if n_class_cal > 0 else 0.0, "expected_abstention_rate": float(np.mean(abstention_loo_preds)) if n_class_cal > 0 else 0.0, "expected_singleton_error_rate": float(np.mean(error_loo_preds)) if n_singletons > 0 else 0.0, "n_grid_points": 1, "pac_level": adjusted_ci_level, "ci_level": ci_level, # Report the intended future test size parameter "test_size": expected_n_class_test, "use_union_bound": use_union_bound, "n_metrics": n_metrics if use_union_bound else None, }
[docs] def compute_pac_operational_bounds_perclass_loo_corrected( ssbc_result_0: SSBCResult, ssbc_result_1: SSBCResult, labels: np.ndarray, probs: np.ndarray, class_label: int, test_size: int, ci_level: float = 0.95, pac_level: float = 0.95, # Kept for API compatibility (not used) use_union_bound: bool = True, n_jobs: int = -1, prediction_method: str = "auto", loo_inflation_factor: float | None = None, verbose: bool = True, ) -> dict: """Compute per-class operational bounds with LOO-CV uncertainty correction. This function uses LOO uncertainty quantification for per-class bounds, enabling method comparison ("all") for individual classes. Parameters ---------- ssbc_result_0 : SSBCResult SSBC result for class 0 ssbc_result_1 : SSBCResult SSBC result for class 1 labels : np.ndarray True labels probs : np.ndarray Predicted probabilities class_label : int Which class to analyze (0 or 1) test_size : int Expected test set size for prediction bounds ci_level : float, default=0.95 Confidence level for prediction bounds use_union_bound : bool, default=True Apply Bonferroni for simultaneous guarantees n_jobs : int, default=-1 Number of parallel jobs (-1 = all cores) prediction_method : str, default="auto" Method for LOO uncertainty quantification: - "auto": Automatically select best method - "analytical": Method 1 (recommended for n>=40) - "exact": Method 2 (recommended for n=20-40) - "hoeffding": Method 3 (ultra-conservative) - "all": Compare all methods loo_inflation_factor : float, optional Manual override for LOO variance inflation factor. If None, automatically estimated. verbose : bool, default=True Print diagnostic information Returns ------- dict Per-class operational bounds with LOO diagnostics (when method="all") """ # Compute k from alpha_corrected n_0 = ssbc_result_0.n n_1 = ssbc_result_1.n k_0 = int(np.ceil((n_0 + 1) * (1 - ssbc_result_0.alpha_corrected))) k_1 = int(np.ceil((n_1 + 1) * (1 - ssbc_result_1.alpha_corrected))) # Parallel LOO-CV: evaluate each sample n = len(labels) eff_jobs = _effective_n_jobs(n_jobs, n) results = _safe_parallel_map( eff_jobs, _evaluate_loo_single_sample_perclass, ((idx, labels, probs, k_0, k_1, class_label) for idx in range(n)), ) # Aggregate results (only from class_label samples) results_array = np.array(results) class_mask = labels == class_label n_singletons = int(np.sum(results_array[class_mask, 0])) # Number of class_label samples in calibration n_class_cal = np.sum(class_mask) # Estimate expected class distribution in test set n_total = len(labels) class_rate_cal = n_class_cal / n_total expected_n_class_test = int(test_size * class_rate_cal) expected_n_class_test = max(expected_n_class_test, 1) # Point estimates (calibration proportions) # Convert to binary LOO predictions for each rate type # Restrict LOO binary arrays to class_label rows only (for unbiased per-class means) singleton_loo_preds = results_array[class_mask, 0].astype(int) doublet_loo_preds = results_array[class_mask, 1].astype(int) abstention_loo_preds = results_array[class_mask, 2].astype(int) # For singleton_error, this is a CONDITIONAL rate (errors given singletons) # We need to compute bounds on the conditional subpopulation (singletons only) # Create error_loo_preds for all class samples (for joint rate if needed) # But for conditional bounds, we'll filter to singletons only below error_loo_preds_all = np.zeros(np.sum(class_mask), dtype=int) if n_singletons > 0: # error_loo_preds_all[i] = 1 if singleton AND error, 0 otherwise (including non-singletons) error_loo_preds_all = (results_array[class_mask, 0] == 1) & (results_array[class_mask, 3] == 0) # For conditional error rate bounds, we need: # - LOO predictions filtered to singleton samples only # - Estimated future singleton count (not total class count) singleton_mask = results_array[class_mask, 0] == 1 error_loo_preds_cond = error_loo_preds_all[singleton_mask] if n_singletons > 0 else np.array([], dtype=int) # Estimate expected number of singletons in test set # Based on calibration: n_singletons / n_class_cal = singleton rate in class # Project to test: expected_n_singletons_test = expected_n_class_test * (n_singletons / n_class_cal) if n_class_cal > 0 and n_singletons > 0: singleton_rate_in_class = n_singletons / n_class_cal expected_n_singletons_test = int(expected_n_class_test * singleton_rate_in_class) expected_n_singletons_test = max(expected_n_singletons_test, 1) else: expected_n_singletons_test = 1 # Apply union bound adjustment n_metrics = 4 if use_union_bound: adjusted_ci_level = 1 - (1 - ci_level) / n_metrics else: adjusted_ci_level = ci_level # Compute LOO-corrected bounds for each rate type # Use compute_robust_prediction_bounds for consistency with marginal bounds # This supports all methods including "all", "auto", "analytical", "exact", "hoeffding", etc. singleton_lower, singleton_upper, singleton_report = compute_robust_prediction_bounds( singleton_loo_preds, expected_n_class_test, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) doublet_lower, doublet_upper, doublet_report = compute_robust_prediction_bounds( doublet_loo_preds, expected_n_class_test, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) abstention_lower, abstention_upper, abstention_report = compute_robust_prediction_bounds( abstention_loo_preds, expected_n_class_test, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) # For singleton_error_rate: This is a CONDITIONAL rate (W_i^{err|0} = 1{E_i=1} given Y_i=0, S_i=singleton) # According to the mathematical framework: # - k_cal = number of errors in conditional subpopulation (singletons with errors) # - n_cal = size of conditional subpopulation (singletons) # - n_test = estimated future conditional test size (estimated future singletons) # # We must use the conditional subpopulation (singletons only) for bounds computation if n_singletons > 0 and len(error_loo_preds_cond) > 0: error_lower, error_upper, error_report = compute_robust_prediction_bounds( error_loo_preds_cond, expected_n_singletons_test, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) else: # No singletons in calibration - cannot compute bounds error_lower = 0.0 error_upper = 1.0 error_report = {"selected_method": "no_singletons", "diagnostics": {}} # For singleton_correct_rate: This is a CONDITIONAL rate (W_i^{cor|0} = 1{E_i=0} given Y_i=0, S_i=singleton) # According to the mathematical framework: # - Bernoulli event: W_i^{cor|0} = 1{E_i=0} (only defined when Y_i=0, S_i=singleton) # This is the complement of W_i^{err|0} = 1{E_i=1} on the same conditional subpopulation # - k_cal = number of correct predictions in conditional subpopulation (singletons without errors) # - n_cal = size of conditional subpopulation (singletons) - same as error rate # - n_test = estimated future conditional test size (estimated future singletons) - same as error rate # # We compute correct rate bounds directly from the conditional subpopulation using compute_robust_prediction_bounds, # NOT by inverting error bounds. This ensures mathematical consistency with the Bernoulli event model. # The correct rate is computed from the complementary Bernoulli event on the same subpopulation. if n_singletons > 0 and len(error_loo_preds_cond) > 0: # Correct rate indicator: 1 if singleton is correct, 0 if error # This is the complement of error_loo_preds_cond on the same conditional subpopulation correct_loo_preds_cond = 1 - error_loo_preds_cond correct_lower, correct_upper, correct_report = compute_robust_prediction_bounds( correct_loo_preds_cond, expected_n_singletons_test, 1 - adjusted_ci_level, method=prediction_method, inflation_factor=loo_inflation_factor, verbose=False, ) else: # No singletons in calibration - cannot compute bounds correct_lower = 0.0 correct_upper = 1.0 correct_report = {"selected_method": "no_singletons", "diagnostics": {}} # Mathematical Framework: # All bounds are computed from single well-defined Bernoulli events. # # For per-class joint rates (e.g., P(class=0 AND singleton)): # - Bernoulli event: Z_i = 1{Y_i=class_label, S_i=pattern} # - k_cal: count of successes in calibration # - n_cal: total calibration size (fixed denominator) # - n_test: expected_n_class_test (estimated future class size) # # These are joint per-class rates with fixed denominators, which is mathematically # correct and avoids ratio estimation problems. # # For singleton_error_rate: This is a CONDITIONAL rate (W_i^{err|0} = 1{E_i=1} given Y_i=0, S_i=singleton) # - Bernoulli event: W_i^{err|0} = 1{E_i=1} (only defined when Y_i=0, S_i=singleton) # - k_cal: number of errors in conditional subpopulation (singletons with errors) # - n_cal: size of conditional subpopulation (singletons) # - n_test: estimated future conditional test size (estimated future singletons) # # According to the framework (Option B: predictive fraction for test run): # - We use the conditional subpopulation (singletons only) for bounds computation # - The denominator (n_test) is random, making bounds conservative # - This is documented in the stability note in the report # # Note: We do NOT compute conditional rates by dividing joint rates by class rates, # as this would require combining two intervals and is not mathematically valid. # Conditional rates are computed directly using the conditional subpopulation. # Expected rates from LOO predictions expected_singleton_error_rate = ( float(np.mean(error_loo_preds_cond)) if n_singletons > 0 and len(error_loo_preds_cond) > 0 else 0.0 ) expected_singleton_correct_rate = ( float(np.mean(correct_loo_preds_cond)) if n_singletons > 0 and len(correct_loo_preds_cond) > 0 else 1.0 ) # Build result dict result = { # Joint per-class rates (full sample, fixed denominator) "singleton_rate_bounds": [singleton_lower, singleton_upper], "doublet_rate_bounds": [doublet_lower, doublet_upper], "abstention_rate_bounds": [abstention_lower, abstention_upper], "singleton_error_rate_bounds": [error_lower, error_upper], # Unbiased LOO estimates (means of LOO predictions) "expected_singleton_rate": float(np.mean(singleton_loo_preds)) if n_class_cal > 0 else 0.0, "expected_doublet_rate": float(np.mean(doublet_loo_preds)) if n_class_cal > 0 else 0.0, "expected_abstention_rate": float(np.mean(abstention_loo_preds)) if n_class_cal > 0 else 0.0, # For singleton_error, compute conditional rate (errors / singletons), not joint rate # error_loo_preds_cond is already filtered to singleton samples only "expected_singleton_error_rate": expected_singleton_error_rate, # Singleton correct rate: P(correct | singleton, class) computed directly from conditional subpopulation # Bernoulli event: W_i^{cor|0} = 1{E_i=0} given Y_i=0, S_i=singleton # k_cal = number of correct predictions in conditional subpopulation (singletons without errors) # n_cal = size of conditional subpopulation (singletons) # n_test = estimated future conditional test size (estimated future singletons) "singleton_correct_rate_bounds": [correct_lower, correct_upper], "expected_singleton_correct_rate": expected_singleton_correct_rate, # Class rate information (for reference) "class_rate_calibration": class_rate_cal, "n_grid_points": 1, "pac_level": adjusted_ci_level, "ci_level": ci_level, "test_size": expected_n_class_test, "use_union_bound": use_union_bound, "n_metrics": n_metrics if use_union_bound else None, # Always include LOO diagnostics (for method reporting) "loo_diagnostics": { "singleton": singleton_report, "doublet": doublet_report, "abstention": abstention_report, "singleton_error": error_report, "singleton_correct": correct_report, }, } return result