Source code for dualbounds.bootstrap

import numpy as np
import pandas as pd
from scipy import stats
from .utilities import vrange, cluster_bootstrap_se, itemize
from .generic import DualBounds
from .delta import DeltaDualBounds


[docs]
def multiplier_bootstrap(
	samples: np.array, 
	alpha: float,
	B: int=1000, 
	maxarrsize: int=int(1e10),
	param: str='max',
	verbose: int=False,
):
	"""
	Computes multiplier bootstrap lower confidence bounds. 

	Precisely, computes a lower confidence bound on
	:math:`\max(\mu_1, \dots, \mu_d)`,
	where :math:`\mu_i` is the mean of ``samples[i]``.

	Parameters
	----------
	samples : np.array
		(n,d)-shaped array where samples[i]
		is i.i.d. with mean :math:`\mu_i`.
	alpha : float
		Nominal error control level.
	B : int
		Number of bootstrap replications
	maxarrsize : float
		Maximum size of an array; used to save memory.
	param : str
		
		- If param='max', computes a lower confidence bound on :math:`\max(\mu_1, \dots, \mu_d)`.
		- Else, computes an upper confidence bound on :math:`\min(\mu_1, \dots, \mu_d)`,

	verbose : bool
		If True, shows a progress bar. Only useful
		if ``samples`` is a very large matrix.

	Returns
	-------
	estimate : float
		Estimate of :math:`\max(\mu_1, \dots, \mu_d)`.
	ci : float
		Lower confidence bound on :math:`\max(\mu_1, \dots, \mu_d)`.
	"""
	if param != 'max':
		estimate, ci = multiplier_bootstrap(
			samples=-1 * samples, 
			alpha=alpha, 
			B=B, 
			maxarrsize=maxarrsize,
			param='max',
		)
		return -estimate, -ci

	hatmu = samples.mean(axis=0)
	hatsigma = samples.std(axis=0)
	# This is important: if hatsigma \approx 0,
	# treat it as zero to ensure numerical stability when dividing.
	# Otherwise this should have zero effect.
	hatsigma = np.around(hatsigma, 10) 
	if np.all(hatsigma == 0):
		return np.max(hatmu), np.max(hatmu)
	if np.any(hatsigma == 0):
		# use noise-free columns as a deterministic lower bound
		min_val = np.max(hatmu[hatsigma == 0])
		samples = samples[:, hatsigma != 0]
		hatmu = hatmu[hatsigma != 0]
		hatsigma = hatsigma[hatsigma != 0]
	else:
		min_val = - np.inf

	# Centered statistics
	Tbs = []

	# Determine batch size
	n, d = samples.shape
	batchsize = min(B, max(1, int(maxarrsize / (n * d))))
	n_batches = int(np.ceil(B / batchsize))
	# Loop and compute bootstrap
	for b in vrange(n_batches, verbose=verbose):
		W = np.random.randn(n, batchsize, 1)
		sw = np.sum(
			W * (samples - hatmu).reshape(n, 1, d),
			axis=0
		) / np.sqrt(n)
		Tbs.append(np.max(sw / hatsigma, axis=1))
	Tbs = np.concatenate(Tbs)
	# Compute quantile and upper CI
	quantile = np.quantile(Tbs, 1-alpha)
	estimate = max(np.max(hatmu), min_val)
	ci = max(
		np.max(hatmu - quantile * hatsigma / np.sqrt(n)), 
		min_val
	)
	return estimate, ci


def dualbound_cluster_bootstrap(
	db_objects: list[DualBounds],
	aipw: bool=True,
	alpha: float=0.05,
	B: int=1000,
):
	"""
	Combines evidence across multiple DualBounds classes
	using a (clustered) bootstrap.

	Parameters
	----------
	db_objects : list
		A list of fit DualBounds classes.
	aipw : bool
		If True, uses AIPW estimators to reduce variance
		(highly recommended).
	alpha : float
		Nominal level, between 0 and 1.
	B : int
		Number of bootstrap replications.
	"""
	# Learn dimensionality
	K = len(db_objects)
	n = db_objects[0].aipw_summands.shape[1]
	# Original estimates and SEs
	estimates = np.stack([x.estimates for x in db_objects], axis=-1) # 2 X K
	ses = np.stack([x.ses for x in db_objects], axis=-1) # 2 X K
	# Combine all of the summands appropriately
	if aipw:
		summands = np.stack([x.aipw_summands for x in db_objects], axis=-1) # 2 X n x K
	else:
		summands = np.stack([x.ipw_summands for x in db_objects], axis=-1)
	if isinstance(db_objects[0], DeltaDualBounds):
		z0summands = np.stack([x.z0summands for x in db_objects], axis=-1) # n x d0 x K
		z1summands = np.stack([x.z1summands for x in db_objects], axis=-1) # n x d1 x K
		# Extract functions
		h = db_objects[0].h
	else:
		z0summands = np.zeros((n, 1, K))
		z1summands = np.zeros((n, 1, K))
		# Dummy versions of the DeltaDualBounds functions
		h = lambda fval, z0, z1: fval

	d0 = z0summands.shape[1]
	def compute_estimate(data):
		hatmu = data.mean(axis=0) # (1 + d0 + d1) x K
		return np.array([
			itemize(h(fval=hatmu[0, k], z0=hatmu[1:(d0+1), k], z1=hatmu[(d0+1):, k]))
			for k in range(K)
		])
		
	# Bootstrap
	combined_estimates = np.zeros(2)
	cis = np.zeros(2)
	#abe = np.zeros((2, B, K))
	#cbe = np.zeros((2, B, K))
	for lower in [0, 1]:
		samples = np.concatenate(
			[summands[1-lower].reshape(n, 1, K), z0summands, z1summands], axis=1
		) # n x (1 + d0 + d1) x K
		# Bootstrapped estimators
		_, bs_estimators = cluster_bootstrap_se(
			data=samples, 
			clusters=db_objects[0].clusters,
			func=compute_estimate,
			B=B,
			verbose=False,
		) # B x K
		# Centerd and standardized variant
		centered = (bs_estimators - estimates[1-lower]) / ses[1-lower]
		# Compute final estimates and bounds
		if lower == 1:
			hatq = np.quantile(centered.max(axis=-1), 1-alpha/2)
			combined_estimates[1-lower] = np.max(estimates[1-lower])
			cis[1-lower] = np.max(estimates[1-lower] - hatq * ses[1-lower])
		else:
			hatq = np.quantile(centered.min(axis=-1), alpha/2)
			combined_estimates[1-lower] = np.min(estimates[1-lower])
			cis[1-lower] = np.min(estimates[1-lower] - hatq * ses[1-lower])
	# Return
	return pd.DataFrame(
		np.stack([combined_estimates, cis], axis=0),
		index=['Estimate', 'Conf. Int.'],
		columns=['Lower', 'Upper'],
	)




[docs]
def dualbound_multiplier_bootstrap(
	db_objects: list[DualBounds], 
	aipw: bool=True,
	alpha: float=0.05,
	**kwargs,
) -> pd.DataFrame:
	"""
	Combines evidence across multiple DualBounds classes
	using the multiplier bootstrap.

	Parameters
	----------
	db_objects : list
		A list of fit DualBounds classes.
	aipw : bool
		If True, uses AIPW estimators to reduce variance
		(highly recommended).
	alpha : float
		Nominal level, between 0 and 1.
	kwargs : dict
		kwargs for dualbounds.bootstrap.multiplier_bootstrap.
	Returns
	-------
	result : pd.DataFrame
		dataframe of results.
	"""
	# Fetch summands
	if aipw:
		summands = [x.aipw_summands for x in db_objects]
	else:
		summands = [x.ipw_summands for x in db_objects]

	# Separate lower/upper summands
	lower_summands = np.stack([x[0, :] for x in summands], axis=1)
	upper_summands = np.stack([x[1, :] for x in summands], axis=1)

	# Compute lower/upper CI
	lower_est, lower_ci = multiplier_bootstrap(
		samples=lower_summands,
		param='max',
		alpha=alpha/2,
		**kwargs
	)
	upper_est, upper_ci = multiplier_bootstrap(
		samples=upper_summands,
		param='min',
		alpha=alpha/2,
		**kwargs
	)
	# Return
	estimates = np.array([lower_est, upper_est])
	cis = np.array([lower_ci, upper_ci])
	return pd.DataFrame(
		np.stack([estimates, cis], axis=0),
		index=['Estimate', 'Conf. Int.'],
		columns=['Lower', 'Upper'],
	)