changeset 194:ca937be30ad7

implement an alternative confidence-interval measure using stats.bootstrap (sketched below); it is messier than the arctanh method
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 04 Dec 2023 09:35:53 +0000
parents 2839f767c82b
children f850ae9a981e
files lib/python/cc/ccstats.py
diffstat 1 files changed, 73 insertions(+), 21 deletions(-)
line diff
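For orientation: when the new bootstrap argument is supplied, plot_ci computes a BCa bootstrap confidence interval for each segment's Spearman rank correlation against the whole crawl, in place of the closed-form arctanh interval used otherwise. A minimal standalone sketch of that technique follows; the arrays whole and segment are synthetic stand-ins, not data from this module.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
whole = rng.random(500)                    # synthetic whole-crawl counts
segment = whole + rng.normal(0, 0.3, 500)  # synthetic counts for one segment

def sr(x, y):
  # statistic handed to stats.bootstrap: Spearman's rho only
  return stats.spearmanr(x, y)[0]

res = stats.bootstrap((whole, segment), sr,
                      paired=True,        # resample (whole, segment) pairs together
                      vectorized=False,   # sr handles one resample at a time
                      method='BCa', n_resamples=999,
                      random_state=rng)
print(sr(whole, segment), res.confidence_interval)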
--- a/lib/python/cc/ccstats.py	Mon Dec 04 09:33:13 2023 +0000
+++ b/lib/python/cc/ccstats.py	Mon Dec 04 09:35:53 2023 +0000
@@ -18,14 +18,21 @@
      
 import numpy as np
 from numpy import loadtxt
+import scipy
 from scipy import stats
 import statsmodels.api as sm
 import matplotlib.pyplot as plt
 import pylab
+from scipy.stats import _bootstrap
 
 import sys, math, os.path, random
 
-SS = {}
+RNG = np.random.default_rng()
+try:
+  SS
+except NameError:
+  SS = {}
+
 class CCStats:
   def __init__(self, year_month, name, file_base):
     global SS
@@ -121,26 +128,43 @@
         'Rank correlation of each segment x whole crawl %s'%self.id,
          align)
 
-  def plot_ci(self,rhos=None,n=None,trim=None,conf=0.95):
-     # rhos are (rank) correlation values
-     if rhos is None:
-       rhos = self.all
-     if n is None:
-       n = rhos.shape[0]
-     rhos_s=rhos[(-rhos).argsort()]
-     if trim is None:
-       l=len(rhos)
-     else:
-       rhos_s=rhos_s[:trim]
-       l=trim
-     cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
-     ue=cc[1]-rhos_s
-     le=rhos_s-cc[0]
-     #for i in range(len(rhos)):
-       #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
-     plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-     plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(self.id,conf*100))
-     plt.show()
+  def plot_ci(self,rhos=None,bootstrap=None,n=None,trim=None,conf=0.95,block=True):
+    global cc  # the computed confidence intervals are also left in module-level cc
+    # rhos are (rank) correlation values
+    # If bootstrap is present, it should be the data from which rhos
+    #  was derived via spearmanr, which will be used for bootstrapping, e.g. self.counts
+    if rhos is None:
+      rhos = self.all
+    if n is None:
+      n = rhos.shape[0]
+    all_order=(-rhos).argsort()
+    rhos_s=rhos[all_order]
+    if trim is None:
+      l=len(rhos)
+    else:
+      rhos_s=rhos_s[:trim]
+      l=trim
+    if bootstrap is not None:
+      all=bootstrap[:,0] # [all_order]
+      rest=bootstrap[:,1:] # [all_order]
+      # bool(print(i)) logs progress and evaluates to False (print returns None)
+      cc=np.array([tuple(stats.bootstrap((all, rest[:,i]), sr,
+                                          vectorized=bool(print(i)),
+                                          paired=True, method='bca',
+                                          random_state=RNG).confidence_interval)
+                    for i in range(l)])
+      #cc=cc[(-rhos).argsort()].T
+      cc=cc.T
+      cc=[cc[0][all_order],cc[1][all_order]]
+    else:
+      cc=np.array([ci(r,n,conf) for r in rhos_s]).T
+    ue=cc[1]-rhos_s
+    le=rhos_s-cc[0]
+    #for i in range(len(rhos)):
+      #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+    plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+    plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(self.id,conf*100))
+    plt.show(block=block)
 
   def ranks(self):
     # Combine segment measures:
@@ -193,6 +217,28 @@
   upper=math.tanh(math.atanh(rho)+delta)
   return (lower,upper)
 
+def sr(x,y):
+  return stats.spearmanr(x,y)[0]
+
+def _percentile_along_axis(theta_hat_b, alpha):
+    """`np.percentile` with different percentile for each slice."""
+    # the difference between _percentile_along_axis and np.percentile is that
+    # np.percentile gets _all_ the qs for each axis slice, whereas
+    # _percentile_along_axis gets the q corresponding with each axis slice
+    shape = theta_hat_b.shape[:-1]
+    alpha = np.broadcast_to(alpha, shape)
+    percentiles = np.zeros_like(alpha, dtype=np.float64)
+    for indices, alpha_i in np.ndenumerate(alpha):
+        if np.isnan(alpha_i):
+            # See https://github.com/scipy/scipy/pull/14351/commits/e5f38dee638d63981e89b4630e2ea5bc3a59a51d
+            percentiles[indices] = np.nan
+        else:
+            theta_hat_b_i = theta_hat_b[indices]
+            percentiles[indices] = np.percentile(theta_hat_b_i, alpha_i)
+    return percentiles[()]  # return scalar instead of 0d array
+
+_bootstrap._percentile_along_axis = _percentile_along_axis  # monkey-patch the installed scipy with the NaN-handling version above
+
 def explore_deltas(basis_ym, basis_names, target_ym, target_names, nn):
   for n in nn:
     for p in basis_names:
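For comparison, the arctanh method mentioned in the commit message is the standard Fisher z-transform interval; only the tail of the module's ci() is visible as context above, so the sketch below assumes the usual form of delta and may not match ci() exactly.

import math
from scipy import stats

def fisher_ci(rho, n, conf=0.95):
  # Fisher z-transform: atanh(rho) is approximately normal with standard
  # error 1/sqrt(n-3); build the interval in z-space and map back with tanh
  delta = stats.norm.ppf((1 + conf) / 2) / math.sqrt(n - 3)
  lower = math.tanh(math.atanh(rho) - delta)
  upper = math.tanh(math.atanh(rho) + delta)
  return (lower, upper)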
@@ -275,6 +321,12 @@
                                                            4),
                                                      target.all_s.minmax[0]))
 
+def dump(file=sys.stdout):
+  for s in SS.values():
+    print("CCStats(%s,%s,%s)"%(repr(s.year_month),repr(s.name),
+                               repr(s.file.split('/',7)[7][:-4])),
+          file=file)
+
 if __name__ == '__main__':
   S = CCStats(*sys.argv[1:])
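A hedged usage sketch of the new keyword; the constructor arguments below are placeholders, and self.counts is the per-segment count matrix the docstring comment points at.

S = CCStats('2019-35', 'warc', 'results/2019-35/counts')  # placeholder arguments
S.plot_ci()                    # closed-form arctanh (Fisher) intervals, as before
S.plot_ci(bootstrap=S.counts,  # BCa bootstrap intervals from the raw counts
          block=False)
dump()                         # list the CCStats instances recorded in SS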