Mercurial > hg > cc > cirrus_work
changeset 194:ca937be30ad7
Implement an alternative confidence measure using stats.bootstrap;
it is messier than the arctanh method.
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 04 Dec 2023 09:35:53 +0000 |
parents | 2839f767c82b |
children | f850ae9a981e |
files | lib/python/cc/ccstats.py |
diffstat | 1 files changed, 73 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/ccstats.py Mon Dec 04 09:33:13 2023 +0000 +++ b/lib/python/cc/ccstats.py Mon Dec 04 09:35:53 2023 +0000 @@ -18,14 +18,21 @@ import numpy as np from numpy import loadtxt +import scipy from scipy import stats import statsmodels.api as sm import matplotlib.pyplot as plt import pylab +from scipy.stats import _bootstrap import sys, math, os.path, random -SS = {} +RNG = np.random.default_rng() +try: + SS +except NameError: + SS = {} + class CCStats: def __init__(self, year_month, name, file_base): global SS @@ -121,26 +128,43 @@ 'Rank self.correlation of each segment self.x whole crawl %s'%self.id, align) - def plot_ci(self,rhos=None,n=None,trim=None,conf=0.95): - # rhos are (rank) correlation values - if rhos is None: - rhos = self.all - if n is None: - n = rhos.shape[0] - rhos_s=rhos[(-rhos).argsort()] - if trim is None: - l=len(rhos) - else: - rhos_s=rhos_s[:trim] - l=trim - cc=(np.array([ci(r,n,conf) for r in rhos_s])).T - ue=cc[1]-rhos_s - le=rhos_s-cc[0] - #for i in range(len(rhos)): - #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) - plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') - plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(self.id,conf*100)) - plt.show() + def plot_ci(self,rhos=None,bootstrap=None,n=None,trim=None,conf=0.95,block=True): + global cc + # rhos are (rank) correlation values + # If bootstrap is present, it should be the data from which rhos + # was derived via spearmanr, which will be used for bootstrapping, e.g. 
self.counts + if rhos is None: + rhos = self.all + if n is None: + n = rhos.shape[0] + all_order=(-rhos).argsort() + rhos_s=rhos[all_order] + if trim is None: + l=len(rhos) + else: + rhos_s=rhos_s[:trim] + l=trim + if bootstrap is not None: + all=bootstrap[:,0] # [all_order] + rest=bootstrap[:,1:] # [all_order] + cc=np.array([tuple(stats.bootstrap((all, + rest[:,i]), + sr, vectorized=bool(print(i)), + paired=True, method='bca', + random_state=RNG).confidence_interval) + for i in range(l)]) + #cc=cc[(-rhos).argsort()].T + cc=cc.T + cc=[cc[0][all_order],cc[1][all_order]] + else: + cc=np.array([ci(r,n,conf) for r in rhos_s]).T + ue=cc[1]-rhos_s + le=rhos_s-cc[0] + #for i in range(len(rhos)): + #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) + plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') + plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(self.id,conf*100)) + plt.show(block=block) def ranks(self): # Combine segment measures: @@ -193,6 +217,28 @@ upper=math.tanh(math.atanh(rho)+delta) return (lower,upper) +def sr(x,y): + return stats.spearmanr(x,y)[0] + +def _percentile_along_axis(theta_hat_b, alpha): + """`np.percentile` with different percentile for each slice.""" + # the difference between _percentile_along_axis and np.percentile is that + # np.percentile gets _all_ the qs for each axis slice, whereas + # _percentile_along_axis gets the q corresponding with each axis slice + shape = theta_hat_b.shape[:-1] + alpha = np.broadcast_to(alpha, shape) + percentiles = np.zeros_like(alpha, dtype=np.float64) + for indices, alpha_i in np.ndenumerate(alpha): + if np.isnan(alpha_i): + # See https://github.com/scipy/scipy/pull/14351/commits/e5f38dee638d63981e89b4630e2ea5bc3a59a51d + percentiles[indices] = np.nan + else: + theta_hat_b_i = theta_hat_b[indices] + percentiles[indices] = np.percentile(theta_hat_b_i, alpha_i) + return percentiles[()] # return scalar instead of 0d array + 
+_bootstrap._percentile_along_axis = _percentile_along_axis + def explore_deltas(basis_ym, basis_names, target_ym, target_names, nn): for n in nn: for p in basis_names: @@ -275,6 +321,12 @@ 4), target.all_s.minmax[0])) +def dump(file=sys.stdout): + for s in SS.values(): + print("CCStats(%s,%s,%s)"%(repr(s.year_month),repr(s.name), + repr(s.file.split('/',7)[7][:-4])), + file=sys.stdout) + if __name__ == '__main__': S = CCStats(*sys.argv[1:])