comparison bin/spearman.py @ 34:2e8002a64f72

compute and graph confidence intervals
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 23 Nov 2022 11:05:45 +0000
parents 317bf47b506c
children f76656fa98f7
comparison
equal deleted inserted replaced
33:317bf47b506c 34:2e8002a64f72
21 from scipy import stats 21 from scipy import stats
22 import statsmodels.api as sm 22 import statsmodels.api as sm
23 import matplotlib.pyplot as plt 23 import matplotlib.pyplot as plt
24 import pylab 24 import pylab
25 25
26 import sys 26 import sys, math
27 27
28 def qqa(): 28 def qqa():
29 # q-q plot for the whole crawl 29 # q-q plot for the whole crawl
30 sm.qqplot(all, line='s') 30 sm.qqplot(all, line='s')
31 plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)') 31 plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
44 sm.qqplot(x[xworst], line='s') 44 sm.qqplot(x[xworst], line='s')
45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) 45 plt.gca().set_title('Worst segment (most variance): %s'%xworst)
46 plt.show() 46 plt.show()
47 47
48 def plot_x(sort=False,block=True): 48 def plot_x(sort=False,block=True):
49 # Make these two subplots... 49 # Make these two subplots, w. and w/o sorting
50 # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
51 # for legend hacking
50 if sort: 52 if sort:
51 aso=np.argsort(-all) 53 aso=np.argsort(-all)
52 plot_all=all[aso] 54 plot_all=all[aso]
53 plot_x=np.array([xd[i].mean for i in range(N)])[aso] 55 plot_x=np.array([xd[i].mean for i in range(N)])[aso]
54 else: 56 else:
57 plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl') 59 plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl')
58 plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl') 60 plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl')
59 plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') 61 plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
60 plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') 62 plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
61 plt.axis([0,N-1,0.8,1.0]) 63 plt.axis([0,N-1,0.8,1.0])
62 plt.legend(loc='best') 64 plt.legend(loc='best',fontsize='small')
63 plt.grid(True) 65 plt.grid(True)
64 plt.show(block=block) 66 plt.show(block=block)
65 67
66 def hist_x(): 68 def hist_x():
67 hist(xm,xsd,[xd[i].mean for i in range(N)], 69 hist(xm,xsd,[xd[i].mean for i in range(N)],
82 sdax.set_xlim(hax.get_xlim()) 84 sdax.set_xlim(hax.get_xlim())
83 sdax.set_ylim(hax.get_ylim()) 85 sdax.set_ylim(hax.get_ylim())
84 sdax.set_xticks([v for s,v in sdd]) 86 sdax.set_xticks([v for s,v in sdd])
85 sdax.set_xticklabels([str(s) for s,v in sdd]) 87 sdax.set_xticklabels([str(s) for s,v in sdd])
86 plt.show() 88 plt.show()
89
90 def ci(rho,n,conf=0.95):
91 # Courtesy of https://stats.stackexchange.com/a/18904
92 # rho is (rank) correlation, n is sample size
93 stderr=1.0/math.sqrt(n-3)
94 z=stats.norm.ppf(1.0-((1.0-conf)/2))
95 delta=z*stderr
96 lower=math.tanh(math.atanh(rho)-delta)
97 upper=math.tanh(math.atanh(rho)+delta)
98 return (lower,upper)
99
100 def plot_ci(rhos,n,trim=None,conf=0.95):
101 # rhos are (rank) correlation values
102 rhos_s=rhos[(-rhos).argsort()]
103 if trim is None:
104 l=len(rhos)
105 else:
106 rhos_s=rhos_s[:trim]
107 l=trim
108 cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
109 ue=cc[1]-rhos_s
110 le=rhos_s-cc[0]
111 #for i in range(len(rhos)):
112 #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
113 plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
114 plt.show()
87 115
88 def first_diff(ranks): 116 def first_diff(ranks):
89 # first disagreement with baseline == {1,2,...} 117 # first disagreement with baseline == {1,2,...}
90 for i in range(len(ranks)): 118 for i in range(len(ranks)):
91 if ranks[i]!=i+1.0: 119 if ranks[i]!=i+1.0: