Source code for dualbounds.bootstrap

import numpy as np
import pandas as pd
from scipy import stats
from .utilities import vrange, cluster_bootstrap_se, itemize
from .generic import DualBounds
from .delta import DeltaDualBounds

[docs] def multiplier_bootstrap( samples: np.array, alpha: float, B: int=1000, maxarrsize: int=int(1e10), param: str='max', verbose: int=False, ): """ Computes multiplier bootstrap lower confidence bounds. Precisely, computes a lower confidence bound on :math:`\max(\mu_1, \dots, \mu_d)`, where :math:`\mu_i` is the mean of ``samples[i]``. Parameters ---------- samples : np.array (n,d)-shaped array where samples[i] is i.i.d. with mean :math:`\mu_i`. alpha : float Nominal error control level. B : int Number of bootstrap replications maxarrsize : float Maximum size of an array; used to save memory. param : str - If param='max', computes a lower confidence bound on :math:`\max(\mu_1, \dots, \mu_d)`. - Else, computes an upper confidence bound on :math:`\min(\mu_1, \dots, \mu_d)`, verbose : bool If True, shows a progress bar. Only useful if ``samples`` is a very large matrix. Returns ------- estimate : float Estimate of :math:`\max(\mu_1, \dots, \mu_d)`. ci : float Lower confidence bound on :math:`\max(\mu_1, \dots, \mu_d)`. """ if param != 'max': estimate, ci = multiplier_bootstrap( samples=-1 * samples, alpha=alpha, B=B, maxarrsize=maxarrsize, param='max', ) return -estimate, -ci hatmu = samples.mean(axis=0) hatsigma = samples.std(axis=0) # This is important: if hatsigma \approx 0, # treat it as zero to ensure numerical stability when dividing. # Otherwise this should have zero effect. hatsigma = np.around(hatsigma, 10) if np.all(hatsigma == 0): return np.max(hatmu), np.max(hatmu) if np.any(hatsigma == 0): # use noise-free columns as a deterministic lower bound min_val = np.max(hatmu[hatsigma == 0]) samples = samples[:, hatsigma != 0] hatmu = hatmu[hatsigma != 0] hatsigma = hatsigma[hatsigma != 0] else: min_val = - np.inf # Centered statistics Tbs = [] # Determine batch size n, d = samples.shape batchsize = min(B, max(1, int(maxarrsize / (n * d)))) n_batches = int(np.ceil(B / batchsize)) # Loop and compute bootstrap for b in vrange(n_batches, verbose=verbose): W = np.random.randn(n, batchsize, 1) sw = np.sum( W * (samples - hatmu).reshape(n, 1, d), axis=0 ) / np.sqrt(n) Tbs.append(np.max(sw / hatsigma, axis=1)) Tbs = np.concatenate(Tbs) # Compute quantile and upper CI quantile = np.quantile(Tbs, 1-alpha) estimate = max(np.max(hatmu), min_val) ci = max( np.max(hatmu - quantile * hatsigma / np.sqrt(n)), min_val ) return estimate, ci
def dualbound_cluster_bootstrap( db_objects: list[DualBounds], aipw: bool=True, alpha: float=0.05, B: int=1000, ): """ Combines evidence across multiple DualBounds classes using a (clustered) bootstrap. Parameters ---------- db_objects : list A list of fit DualBounds classes. aipw : bool If True, uses AIPW estimators to reduce variance (highly recommended). alpha : float Nominal level, between 0 and 1. B : int Number of bootstrap replications. """ # Learn dimensionality K = len(db_objects) n = db_objects[0].aipw_summands.shape[1] # Original estimates and SEs estimates = np.stack([x.estimates for x in db_objects], axis=-1) # 2 X K ses = np.stack([x.ses for x in db_objects], axis=-1) # 2 X K # Combine all of the summands appropriately if aipw: summands = np.stack([x.aipw_summands for x in db_objects], axis=-1) # 2 X n x K else: summands = np.stack([x.ipw_summands for x in db_objects], axis=-1) if isinstance(db_objects[0], DeltaDualBounds): z0summands = np.stack([x.z0summands for x in db_objects], axis=-1) # n x d0 x K z1summands = np.stack([x.z1summands for x in db_objects], axis=-1) # n x d1 x K # Extract functions h = db_objects[0].h else: z0summands = np.zeros((n, 1, K)) z1summands = np.zeros((n, 1, K)) # Dummy versions of the DeltaDualBounds functions h = lambda fval, z0, z1: fval d0 = z0summands.shape[1] def compute_estimate(data): hatmu = data.mean(axis=0) # (1 + d0 + d1) x K return np.array([ itemize(h(fval=hatmu[0, k], z0=hatmu[1:(d0+1), k], z1=hatmu[(d0+1):, k])) for k in range(K) ]) # Bootstrap combined_estimates = np.zeros(2) cis = np.zeros(2) #abe = np.zeros((2, B, K)) #cbe = np.zeros((2, B, K)) for lower in [0, 1]: samples = np.concatenate( [summands[1-lower].reshape(n, 1, K), z0summands, z1summands], axis=1 ) # n x (1 + d0 + d1) x K # Bootstrapped estimators _, bs_estimators = cluster_bootstrap_se( data=samples, clusters=db_objects[0].clusters, func=compute_estimate, B=B, verbose=False, ) # B x K # Centerd and standardized variant centered = (bs_estimators - estimates[1-lower]) / ses[1-lower] # Compute final estimates and bounds if lower == 1: hatq = np.quantile(centered.max(axis=-1), 1-alpha/2) combined_estimates[1-lower] = np.max(estimates[1-lower]) cis[1-lower] = np.max(estimates[1-lower] - hatq * ses[1-lower]) else: hatq = np.quantile(centered.min(axis=-1), alpha/2) combined_estimates[1-lower] = np.min(estimates[1-lower]) cis[1-lower] = np.min(estimates[1-lower] - hatq * ses[1-lower]) # Return return pd.DataFrame( np.stack([combined_estimates, cis], axis=0), index=['Estimate', 'Conf. Int.'], columns=['Lower', 'Upper'], )
[docs] def dualbound_multiplier_bootstrap( db_objects: list[DualBounds], aipw: bool=True, alpha: float=0.05, **kwargs, ) -> pd.DataFrame: """ Combines evidence across multiple DualBounds classes using the multiplier bootstrap. Parameters ---------- db_objects : list A list of fit DualBounds classes. aipw : bool If True, uses AIPW estimators to reduce variance (highly recommended). alpha : float Nominal level, between 0 and 1. kwargs : dict kwargs for dualbounds.bootstrap.multiplier_bootstrap. Returns ------- result : pd.DataFrame dataframe of results. """ # Fetch summands if aipw: summands = [x.aipw_summands for x in db_objects] else: summands = [x.ipw_summands for x in db_objects] # Separate lower/upper summands lower_summands = np.stack([x[0, :] for x in summands], axis=1) upper_summands = np.stack([x[1, :] for x in summands], axis=1) # Compute lower/upper CI lower_est, lower_ci = multiplier_bootstrap( samples=lower_summands, param='max', alpha=alpha/2, **kwargs ) upper_est, upper_ci = multiplier_bootstrap( samples=upper_summands, param='min', alpha=alpha/2, **kwargs ) # Return estimates = np.array([lower_est, upper_est]) cis = np.array([lower_ci, upper_ci]) return pd.DataFrame( np.stack([estimates, cis], axis=0), index=['Estimate', 'Conf. Int.'], columns=['Lower', 'Upper'], )