"""Statistical utility functions for SSBC."""
from typing import Any
import numpy as np
from scipy import stats
from scipy.special import betaln, gammaln
# Exported public API
__all__ = [
"clopper_pearson_lower",
"clopper_pearson_upper",
"clopper_pearson_intervals",
"cp_interval",
"prediction_bounds",
"prediction_bounds_beta_binomial",
]
[docs]
def clopper_pearson_lower(k: int, n: int, confidence: float = 0.95) -> float:
"""Compute lower Clopper-Pearson (one-sided) confidence bound.
Parameters
----------
k : int
Number of successes
n : int
Total number of trials
confidence : float, default=0.95
Confidence level (e.g., 0.95 for 95% confidence)
Returns
-------
float
Lower confidence bound for the true proportion
Examples
--------
>>> lower = clopper_pearson_lower(k=5, n=10, confidence=0.95)
>>> print(f"Lower bound: {lower:.3f}")
Notes
-----
Uses Beta distribution quantiles for exact binomial confidence bounds.
For PAC-style guarantees, you may want to use delta = 1 - confidence.
"""
if k == 0:
return 0.0
# L = Beta^{-1}(1-confidence; k, n-k+1)
# Note: Using (1-confidence) as the lower tail probability
alpha = 1 - confidence
return float(stats.beta.ppf(alpha, k, n - k + 1))
[docs]
def clopper_pearson_upper(k: int, n: int, confidence: float = 0.95) -> float:
"""Compute upper Clopper-Pearson (one-sided) confidence bound.
Parameters
----------
k : int
Number of successes
n : int
Total number of trials
confidence : float, default=0.95
Confidence level (e.g., 0.95 for 95% confidence)
Returns
-------
float
Upper confidence bound for the true proportion
Examples
--------
>>> upper = clopper_pearson_upper(k=5, n=10, confidence=0.95)
>>> print(f"Upper bound: {upper:.3f}")
Notes
-----
Uses Beta distribution quantiles for exact binomial confidence bounds.
For PAC-style guarantees, you may want to use delta = 1 - confidence.
"""
if k == n:
return 1.0
# U = Beta^{-1}(confidence; k+1, n-k)
# Note: Using confidence directly for upper tail
return float(stats.beta.ppf(confidence, k + 1, n - k))
[docs]
def clopper_pearson_intervals(labels: np.ndarray, confidence: float = 0.95) -> dict[int, dict[str, Any]]:
"""Compute Clopper-Pearson (exact binomial) confidence intervals for class prevalences.
Parameters
----------
labels : np.ndarray
Binary labels (0 or 1)
confidence : float, default=0.95
Confidence level (e.g., 0.95 for 95% CI)
Returns
-------
dict
Dictionary with keys 0 and 1, each containing:
- 'count': number of samples in this class
- 'proportion': observed proportion
- 'lower': lower bound of CI
- 'upper': upper bound of CI
Examples
--------
>>> labels = np.array([0, 0, 1, 1, 0])
>>> intervals = clopper_pearson_intervals(labels, confidence=0.95)
>>> print(intervals[0]['proportion'])
0.6
Notes
-----
The Clopper-Pearson interval is an exact binomial confidence interval
based on Beta distribution quantiles. It provides conservative coverage
guarantees.
"""
alpha = 1 - confidence
n_total = len(labels)
intervals = {}
for label in [0, 1]:
count = np.sum(labels == label)
proportion = count / n_total if n_total > 0 else float("nan")
# Clopper-Pearson uses Beta distribution quantiles
# Lower bound: Beta(count, n-count+1) at alpha/2
# Upper bound: Beta(count+1, n-count) at 1-alpha/2
if count == 0:
lower = 0.0
upper = stats.beta.ppf(1 - alpha / 2, count + 1, n_total - count)
elif count == n_total:
lower = stats.beta.ppf(alpha / 2, count, n_total - count + 1)
upper = 1.0
else:
lower = stats.beta.ppf(alpha / 2, count, n_total - count + 1)
upper = stats.beta.ppf(1 - alpha / 2, count + 1, n_total - count)
intervals[label] = {"count": count, "proportion": proportion, "lower": lower, "upper": upper}
return intervals
[docs]
def cp_interval(count: int, total: int, confidence: float = 0.95) -> dict[str, float]:
"""Compute Clopper-Pearson exact confidence interval.
Helper function for computing a single CI from count and total.
Parameters
----------
count : int
Number of successes
total : int
Total number of trials
confidence : float, default=0.95
Confidence level
Returns
-------
dict
Dictionary with keys:
- 'count': original count
- 'proportion': count/total
- 'lower': lower CI bound
- 'upper': upper CI bound
"""
alpha = 1 - confidence
count = int(count)
total = int(total)
if total <= 0:
return {
"count": count,
"proportion": float("nan"),
"lower": 0.0,
"upper": 1.0,
}
p = count / total
if count == 0:
lower = 0.0
upper = stats.beta.ppf(1 - alpha / 2, 1, total)
elif count == total:
lower = stats.beta.ppf(alpha / 2, total, 1)
upper = 1.0
else:
lower = stats.beta.ppf(alpha / 2, count, total - count + 1)
upper = stats.beta.ppf(1 - alpha / 2, count + 1, total - count)
return {"count": count, "proportion": float(p), "lower": float(lower), "upper": float(upper)}
def prediction_bounds_lower(k_cal: int, n_cal: int, n_test: int, confidence: float = 0.95) -> float:
"""Compute lower prediction bound accounting for both calibration and test set sampling uncertainty.
This function computes prediction bounds for operational rates on future test sets,
accounting for both calibration uncertainty and test set sampling variability.
Parameters
----------
k_cal : int
Number of successes in calibration data
n_cal : int
Total number of samples in calibration data
n_test : int
Expected size of future test sets
confidence : float, default=0.95
Confidence level (e.g., 0.95 for 95% prediction bounds)
Returns
-------
float
Lower prediction bound for operational rates on future test sets
Notes
-----
The prediction bounds account for both:
1. Calibration uncertainty: uncertainty in the true rate p from calibration data
2. Test set sampling uncertainty: variability when sampling n_test points from the true distribution
Mathematical formula:
SE = sqrt(p̂(1-p̂) * (1/n_cal + 1/n_test))
where p̂ = k_cal/n_cal is the estimated rate from calibration data.
For large n_test, bounds approach calibration-only bounds.
For small n_test, bounds are wider due to additional test set sampling uncertainty.
"""
if n_cal <= 0 or n_test <= 0:
return 0.0
# Estimated rate from calibration
p_hat = k_cal / n_cal
# Standard error accounting for both calibration and test set sampling
# SE = sqrt(p̂(1-p̂) * (1/n_cal + 1/n_test))
se = np.sqrt(np.clip(p_hat * (1 - p_hat) * (1 / n_cal + 1 / n_test), 0.0, None))
# Z-score for confidence level
alpha = 1 - confidence
z_score = stats.norm.ppf(alpha / 2)
# Lower prediction bound
lower_bound = p_hat + z_score * se
# Ensure bounds are in [0, 1]
return max(0.0, min(1.0, lower_bound))
def prediction_bounds_upper(k_cal: int, n_cal: int, n_test: int, confidence: float = 0.95) -> float:
"""Compute upper prediction bound accounting for both calibration and test set sampling uncertainty.
This function computes prediction bounds for operational rates on future test sets,
accounting for both calibration uncertainty and test set sampling variability.
Parameters
----------
k_cal : int
Number of successes in calibration data
n_cal : int
Total number of samples in calibration data
n_test : int
Expected size of future test sets
confidence : float, default=0.95
Confidence level (e.g., 0.95 for 95% prediction bounds)
Returns
-------
float
Upper prediction bound for operational rates on future test sets
Notes
-----
The prediction bounds account for both:
1. Calibration uncertainty: uncertainty in the true rate p from calibration data
2. Test set sampling uncertainty: variability when sampling n_test points from the true distribution
Mathematical formula:
SE = sqrt(p̂(1-p̂) * (1/n_cal + 1/n_test))
where p̂ = k_cal/n_cal is the estimated rate from calibration data.
For large n_test, bounds approach calibration-only bounds.
For small n_test, bounds are wider due to additional test set sampling uncertainty.
"""
if n_cal <= 0 or n_test <= 0:
return 1.0
# Estimated rate from calibration
p_hat = k_cal / n_cal
# Standard error accounting for both calibration and test set sampling
# SE = sqrt(p̂(1-p̂) * (1/n_cal + 1/n_test))
se = np.sqrt(np.clip(p_hat * (1 - p_hat) * (1 / n_cal + 1 / n_test), 0.0, None))
# Z-score for confidence level
alpha = 1 - confidence
z_score = stats.norm.ppf(1 - alpha / 2)
# Upper prediction bound
upper_bound = p_hat + z_score * se
# Ensure bounds are in [0, 1]
return max(0.0, min(1.0, upper_bound))
def _log_binomial_coef(n: int, k: int) -> float:
"""Compute log(n choose k) = log(n!) - log(k!) - log((n-k)!)."""
return gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1)
def _beta_binomial_pmf(k: int, n: int, alpha: float, beta: float) -> float:
"""Beta-binomial PMF: pmf(k; n, alpha, beta) = C(n,k) * Beta(k+alpha, n-k+beta) / Beta(alpha, beta).
Parameters
----------
k : int
Number of successes
n : int
Total number of trials
alpha : float
First Beta parameter
beta : float
Second Beta parameter
Returns
-------
float
Probability mass at k
"""
return np.exp(_log_binomial_coef(n, k) + betaln(k + alpha, n - k + beta) - betaln(alpha, beta))
[docs]
def prediction_bounds_beta_binomial(
k_cal: int,
n_cal: int,
n_test: int,
confidence: float = 0.95,
use_jeffreys: bool = False,
tail: str = "equal_tailed",
) -> tuple[float, float]:
"""Beta-Binomial predictive interval for future empirical rate.
This function computes exact prediction bounds using the Beta-Binomial distribution,
which properly accounts for both calibration uncertainty and test set sampling variability.
Parameters
----------
k_cal : int
Number of successes in calibration data
n_cal : int
Total number of samples in calibration data
n_test : int
Expected size of future test sets
confidence : float, default=0.95
Desired predictive mass (confidence level)
use_jeffreys : bool, default=False
If False, use uniform prior Beta(1,1), giving posterior Beta(k_cal+1, n_cal-k_cal+1).
If True, use Jeffreys prior Beta(1/2,1/2), giving posterior Beta(k_cal+0.5, n_cal-k_cal+0.5).
tail : str, default="equal_tailed"
Interval type:
- "equal_tailed": Invert predictive CDF (α/2 each side)
- "hpd": Shortest high posterior density predictive interval
Returns
-------
tuple[float, float]
(lower_rate, upper_rate) for operational rates on future test sets
Notes
-----
This method models:
1. True rate p ~ Beta(alpha, beta) (posterior from calibration data)
2. Future successes k_test | p ~ Binomial(n_test, p)
3. Marginal predictive distribution: k_test ~ BetaBinomial(n_test, alpha, beta)
4. Return bounds on the rate k_test / n_test
This provides exact finite-sample prediction bounds that account for both sources
of uncertainty without normal approximations.
"""
if n_cal <= 0 or n_test <= 0:
return (0.0, 1.0)
offset = 0.5 if use_jeffreys else 1.0
alpha = k_cal + offset
beta = (n_cal - k_cal) + offset
# Predictive PMF over k_test = 0..n_test
ks = np.arange(n_test + 1)
pmf = np.array([_beta_binomial_pmf(k, n_test, alpha, beta) for k in ks])
pmf = pmf / pmf.sum() # Numerical safety (normalize to ensure sum = 1)
if tail == "equal_tailed":
cdf = np.cumsum(pmf)
alpha_tail = (1.0 - confidence) / 2.0
# Lower index = smallest k with CDF >= alpha_tail
k_lo = int(np.searchsorted(cdf, alpha_tail))
# Upper index = largest k with CDF <= 1 - alpha_tail
k_hi = int(np.searchsorted(cdf, 1.0 - alpha_tail, side="right") - 1)
elif tail == "hpd":
# Sort ks by PMF descending, then add mass until >= confidence,
# then take min/max k in that included set. This gives a highest-density
# predictive set, then we report its hull.
order = np.argsort(-pmf)
keep = []
mass = 0.0
for idx in order:
keep.append(idx)
mass += pmf[idx]
if mass >= confidence:
break
k_lo = ks[min(keep)]
k_hi = ks[max(keep)]
else:
raise ValueError("tail must be 'equal_tailed' or 'hpd'")
return (k_lo / n_test, k_hi / n_test)
[docs]
def prediction_bounds(
k_cal: int, n_cal: int, n_test: int, confidence: float = 0.95, method: str = "simple"
) -> tuple[float, float]:
"""Compute prediction bounds accounting for both calibration and test set sampling uncertainty.
This function provides two methods for computing prediction bounds:
1. "simple": Uses standard error formula (faster, good for large samples)
2. "beta_binomial": Uses Beta-Binomial distribution (more accurate for small samples)
Parameters
----------
k_cal : int
Number of successes in calibration data for a **single well-defined Bernoulli event**.
Must be the count of a binary indicator (e.g., Z_i = 1{event}) across all n_cal trials.
n_cal : int
Total number of trials in calibration data for the **same Bernoulli event**.
This is the fixed denominator (total sample size or conditional subpopulation size).
n_test : int
Expected number of future trials for the **same Bernoulli event**.
For joint rates, this is the planned test size (fixed).
For conditional rates, this is an estimated future conditional subpopulation size.
confidence : float, default=0.95
Confidence level (e.g., 0.95 for 95% prediction bounds)
method : str, default="simple"
Method to use: "simple" or "beta_binomial"
Returns
-------
tuple[float, float]
(lower_bound, upper_bound) for operational rates on future test sets
Examples
--------
>>> # Simple method (default)
>>> lower, upper = prediction_bounds(k_cal=50, n_cal=100, n_test=1000, confidence=0.95)
>>> print(f"Simple bounds: [{lower:.3f}, {upper:.3f}]")
>>> # Beta-Binomial method (more accurate for small samples)
>>> lower, upper = prediction_bounds(k_cal=50, n_cal=100, n_test=1000, confidence=0.95, method="beta_binomial")
>>> print(f"Beta-Binomial bounds: [{lower:.3f}, {upper:.3f}]")
Notes
-----
The prediction bounds account for both:
1. Calibration uncertainty: uncertainty in the true rate p from calibration data
2. Test set sampling uncertainty: variability when sampling n_test points from the true distribution
**Simple method** (default):
- Mathematical formula: SE = sqrt(p̂(1-p̂) * (1/n_cal + 1/n_test))
- Good for large sample sizes
- Faster computation
**Beta-Binomial method**:
- Uses Beta-Binomial distribution for exact finite-sample modeling
- More accurate for small sample sizes
- Slower computation
- Uses uniform prior Beta(1,1) and equal-tailed intervals by default
- For advanced use (Jeffreys prior or HPD intervals), call
prediction_bounds_beta_binomial() directly
For large n_test, bounds approach calibration-only bounds.
For small n_test, bounds are wider due to additional test set sampling uncertainty.
This is the recommended function for computing operational rate bounds when
applying fixed thresholds to future test sets.
"""
if method == "simple":
raw_lower = prediction_bounds_lower(k_cal, n_cal, n_test, confidence)
raw_upper = prediction_bounds_upper(k_cal, n_cal, n_test, confidence)
# Ensure monotonicity (clip in case of numerical noise)
lower = min(raw_lower, raw_upper)
upper = max(raw_lower, raw_upper)
return (lower, upper)
elif method == "beta_binomial":
return prediction_bounds_beta_binomial(k_cal, n_cal, n_test, confidence)
else:
raise ValueError(f"Unknown method: {method}. Use 'simple' or 'beta_binomial'.")
[docs]
def ensure_ci(d: dict[str, Any] | Any, count: int, total: int, confidence: float = 0.95) -> tuple[float, float, float]:
"""Extract or compute rate and confidence interval from a dictionary.
If the dictionary already contains rate/CI information, use it.
Otherwise, compute Clopper-Pearson CI from count/total.
This function re-normalizes to the requested confidence level if the
provided dictionary is missing bounds or if the provided bounds look
degenerate (NaN values).
Parameters
----------
d : dict
Dictionary that may contain 'rate'/'proportion' and 'lower'/'upper'
count : int
Count for CI computation (if needed)
total : int
Total for CI computation (if needed)
confidence : float, default=0.95
Confidence level
Returns
-------
tuple of (rate, lower, upper)
Rate and confidence interval bounds
"""
# Initialize with NaN to detect missing values
r = np.nan
lo = np.nan
hi = np.nan
if isinstance(d, dict):
if "rate" in d:
r = float(d["rate"])
elif "proportion" in d:
r = float(d["proportion"])
if "ci_95" in d and isinstance(d["ci_95"], tuple | list) and len(d["ci_95"]) == 2:
lo, hi = map(float, d["ci_95"])
else:
lo = float(d.get("lower", np.nan))
hi = float(d.get("upper", np.nan))
# If missing or invalid (NaN), compute CP interval
need_ci = np.isnan(r) or np.isnan(lo) or np.isnan(hi)
if need_ci:
ci = cp_interval(count, total, confidence)
return ci["proportion"], ci["lower"], ci["upper"]
return r, lo, hi