cc/cirrus_work: bin/spearman.py comparison

comparison bin/spearman.py @ 34:2e8002a64f72

compute and graph confidence intervals

author	Henry S. Thompson <ht@inf.ed.ac.uk>
date	Wed, 23 Nov 2022 11:05:45 +0000
parents	317bf47b506c
children	f76656fa98f7

comparison

equal deleted inserted replaced

-:317bf47b506c
+:2e8002a64f72
 from scipy import stats
 import statsmodels.api as sm
 import matplotlib.pyplot as plt
 import pylab
-import sys
+import sys, math
 def qqa():
 # q-q plot for the whole crawl
 sm.qqplot(all, line='s')
 plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
 sm.qqplot(x[xworst], line='s')
 plt.gca().set_title('Worst segment (most variance): %s'%xworst)
 plt.show()
 def plot_x(sort=False,block=True):
-# Make these two subplots...
+# Make these two subplots, w. and w/o sorting
+# See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+#  for legend hacking
 if sort:
 aso=np.argsort(-all)
 plot_all=all[aso]
 plot_x=np.array([xd[i].mean for i in range(N)])[aso]
 else:
 plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl')
 plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl')
 plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
 plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
 plt.axis([0,N-1,0.8,1.0])
-plt.legend(loc='best')
+plt.legend(loc='best',fontsize='small')
 plt.grid(True)
 plt.show(block=block)
 def hist_x():
 hist(xm,xsd,[xd[i].mean for i in range(N)],
 sdax.set_xlim(hax.get_xlim())
 sdax.set_ylim(hax.get_ylim())
 sdax.set_xticks([v for s,v in sdd])
 sdax.set_xticklabels([str(s) for s,v in sdd])
 plt.show()
def ci(rho, n, conf=0.95):
    """Return the (lower, upper) confidence interval for a correlation.

    Uses the Fisher z-transformation, courtesy of
    https://stats.stackexchange.com/a/18904

    rho is the (rank) correlation, n is the sample size and conf is the
    two-sided confidence level (0.95 gives a 95% interval).

    Raises ValueError if n <= 3, because the standard error
    1/sqrt(n-3) is undefined there, or if rho lies outside [-1, 1].
    """
    if n <= 3:
        raise ValueError('ci needs a sample size n > 3, got %s' % (n,))
    if abs(rho) == 1.0:
        # atanh(+-1) is infinite, so math.atanh would raise a domain error;
        # in the limit the interval collapses onto rho itself.
        return (float(rho), float(rho))
    stderr = 1.0 / math.sqrt(n - 3)
    # Two-sided critical value of the standard normal distribution
    z = stats.norm.ppf(1.0 - ((1.0 - conf) / 2))
    delta = z * stderr
    # math.atanh raises ValueError when |rho| > 1
    fisher_z = math.atanh(rho)
    lower = math.tanh(fisher_z - delta)
    upper = math.tanh(fisher_z + delta)
    return (lower, upper)
def plot_ci(rhos, n, trim=None, conf=0.95):
    """Plot correlations in descending order with confidence-interval bars.

    rhos are (rank) correlation values, n is the sample size handed to ci
    for every value, and conf is the confidence level handed to ci.
    If trim is given, only the first trim values of the sorted sequence
    are plotted (ordinary slice semantics, so a trim larger than the
    number of values simply plots them all).

    Draws with plt.errorbar and then calls plt.show().
    """
    rhos = np.asarray(rhos, dtype=float)
    # Sort from largest to smallest correlation
    rhos_s = rhos[(-rhos).argsort()]
    if trim is not None:
        rhos_s = rhos_s[:trim]
    # One (lower, upper) pair per value; the reshape keeps the array
    # two-dimensional when there is nothing to plot, and the transpose
    # gives a row of lower bounds followed by a row of upper bounds.
    cc = np.array([ci(r, n, conf) for r in rhos_s]).reshape(-1, 2).T
    # errorbar wants distances from the point, not absolute bounds
    ue = cc[1] - rhos_s
    le = rhos_s - cc[0]
    # x positions come from the data actually kept, not from trim itself
    plt.errorbar(np.arange(len(rhos_s)), rhos_s, yerr=[le, ue], fmt='o')
    plt.show()
 def first_diff(ranks):
 # first disagreement with baseline == {1,2,...}
 for i in range(len(ranks)):
 if ranks[i]!=i+1.0:

Mercurial > hg > cc > cirrus_work

comparison bin/spearman.py @ 34:2e8002a64f72