# HG changeset patch
# User Henry S. Thompson <ht@inf.ed.ac.uk>
# Date 1669201545 0
# Node ID 2e8002a64f729c41cb7495873ad99d6ebb1af6b6
# Parent  317bf47b506cda5602fa630e4bd073d4c58947a8
compute and graph confidence intervals

diff -r 317bf47b506c -r 2e8002a64f72 bin/spearman.py
--- a/bin/spearman.py	Tue Nov 22 19:13:25 2022 +0000
+++ b/bin/spearman.py	Wed Nov 23 11:05:45 2022 +0000
@@ -23,7 +23,7 @@
 import matplotlib.pyplot as plt
 import pylab
 
-import sys
+import sys, math
 
 def qqa():
   # q-q plot for the whole crawl
@@ -46,7 +46,9 @@
   plt.show()
 
 def plot_x(sort=False,block=True):
-  # Make these two subplots...
+  # Make these two subplots, w. and w/o sorting
+  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+  #  for legend hacking
   if sort:
     aso=np.argsort(-all)
     plot_all=all[aso]
@@ -59,7 +61,7 @@
   plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
   plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
   plt.axis([0,N-1,0.8,1.0])
-  plt.legend(loc='best')
+  plt.legend(loc='best',fontsize='small')
   plt.grid(True)
   plt.show(block=block)
 
@@ -85,6 +87,32 @@
   sdax.set_xticklabels([str(s) for s,v in sdd])
   plt.show()
 
+def ci(rho,n,conf=0.95):
+  # Courtesy of https://stats.stackexchange.com/a/18904
+  # rho is (rank) correlation, n is sample size
+  stderr=1.0/math.sqrt(n-3)
+  z=stats.norm.ppf(1.0-((1.0-conf)/2))
+  delta=z*stderr
+  lower=math.tanh(math.atanh(rho)-delta)
+  upper=math.tanh(math.atanh(rho)+delta)
+  return (lower,upper)
+
+def plot_ci(rhos,n,trim=None,conf=0.95):
+   # rhos are (rank) correlation values
+   rhos_s=rhos[(-rhos).argsort()]
+   if trim is None:
+     l=len(rhos)
+   else:
+     rhos_s=rhos_s[:trim]
+     l=trim
+   cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
+   ue=cc[1]-rhos_s
+   le=rhos_s-cc[0]
+   #for i in range(len(rhos)):
+     #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+   plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+   plt.show()
+
 def first_diff(ranks):
   # first disagreement with baseline == {1,2,...}
   for i in range(len(ranks)):