changeset 34:2e8002a64f72

compute and graph confidence intervals
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 23 Nov 2022 11:05:45 +0000
parents 317bf47b506c
children f76656fa98f7
files bin/spearman.py
diffstat 1 files changed, 31 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/bin/spearman.py	Tue Nov 22 19:13:25 2022 +0000
+++ b/bin/spearman.py	Wed Nov 23 11:05:45 2022 +0000
@@ -23,7 +23,7 @@
 import matplotlib.pyplot as plt
 import pylab
 
-import sys
+import sys, math
 
 def qqa():
   # q-q plot for the whole crawl
@@ -46,7 +46,9 @@
   plt.show()
 
 def plot_x(sort=False,block=True):
-  # Make these two subplots...
+  # Make these two subplots, w. and w/o sorting
+  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+  #  for legend hacking
   if sort:
     aso=np.argsort(-all)
     plot_all=all[aso]
@@ -59,7 +61,7 @@
   plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
   plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
   plt.axis([0,N-1,0.8,1.0])
-  plt.legend(loc='best')
+  plt.legend(loc='best',fontsize='small')
   plt.grid(True)
   plt.show(block=block)
 
@@ -85,6 +87,32 @@
   sdax.set_xticklabels([str(s) for s,v in sdd])
   plt.show()
 
+def ci(rho,n,conf=0.95):
+  # Courtesy of https://stats.stackexchange.com/a/18904
+  # rho is (rank) correlation, n is sample size
+  stderr=1.0/math.sqrt(n-3)
+  z=stats.norm.ppf(1.0-((1.0-conf)/2))
+  delta=z*stderr
+  lower=math.tanh(math.atanh(rho)-delta)
+  upper=math.tanh(math.atanh(rho)+delta)
+  return (lower,upper)
+
+def plot_ci(rhos,n,trim=None,conf=0.95):
+   # rhos are (rank) correlation values
+   rhos_s=rhos[(-rhos).argsort()]
+   if trim is None:
+     l=len(rhos)
+   else:
+     rhos_s=rhos_s[:trim]
+     l=trim
+   cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
+   ue=cc[1]-rhos_s
+   le=rhos_s-cc[0]
+   #for i in range(len(rhos)):
+     #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+   plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+   plt.show()
+
 def first_diff(ranks):
   # first disagreement with baseline == {1,2,...}
   for i in range(len(ranks)):