Mercurial > hg > cc > cirrus_work
comparison bin/spearman.py @ 34:2e8002a64f72
compute and graph confidence intervals
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 23 Nov 2022 11:05:45 +0000 |
parents | 317bf47b506c |
children | f76656fa98f7 |
comparison
equal
deleted
inserted
replaced
33:317bf47b506c | 34:2e8002a64f72 |
---|---|
21 from scipy import stats | 21 from scipy import stats |
22 import statsmodels.api as sm | 22 import statsmodels.api as sm |
23 import matplotlib.pyplot as plt | 23 import matplotlib.pyplot as plt |
24 import pylab | 24 import pylab |
25 | 25 |
26 import sys | 26 import sys, math |
27 | 27 |
28 def qqa(): | 28 def qqa(): |
29 # q-q plot for the whole crawl | 29 # q-q plot for the whole crawl |
30 sm.qqplot(all, line='s') | 30 sm.qqplot(all, line='s') |
31 plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)') | 31 plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)') |
44 sm.qqplot(x[xworst], line='s') | 44 sm.qqplot(x[xworst], line='s') |
45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) | 45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) |
46 plt.show() | 46 plt.show() |
47 | 47 |
48 def plot_x(sort=False,block=True): | 48 def plot_x(sort=False,block=True): |
49 # Make these two subplots... | 49 # Make these two subplots, w. and w/o sorting |
50 # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot | |
51 # for legend hacking | |
50 if sort: | 52 if sort: |
51 aso=np.argsort(-all) | 53 aso=np.argsort(-all) |
52 plot_all=all[aso] | 54 plot_all=all[aso] |
53 plot_x=np.array([xd[i].mean for i in range(N)])[aso] | 55 plot_x=np.array([xd[i].mean for i in range(N)])[aso] |
54 else: | 56 else: |
57 plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl') | 59 plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl') |
58 plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl') | 60 plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl') |
59 plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') | 61 plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') |
60 plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') | 62 plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') |
61 plt.axis([0,N-1,0.8,1.0]) | 63 plt.axis([0,N-1,0.8,1.0]) |
62 plt.legend(loc='best') | 64 plt.legend(loc='best',fontsize='small') |
63 plt.grid(True) | 65 plt.grid(True) |
64 plt.show(block=block) | 66 plt.show(block=block) |
65 | 67 |
66 def hist_x(): | 68 def hist_x(): |
67 hist(xm,xsd,[xd[i].mean for i in range(N)], | 69 hist(xm,xsd,[xd[i].mean for i in range(N)], |
82 sdax.set_xlim(hax.get_xlim()) | 84 sdax.set_xlim(hax.get_xlim()) |
83 sdax.set_ylim(hax.get_ylim()) | 85 sdax.set_ylim(hax.get_ylim()) |
84 sdax.set_xticks([v for s,v in sdd]) | 86 sdax.set_xticks([v for s,v in sdd]) |
85 sdax.set_xticklabels([str(s) for s,v in sdd]) | 87 sdax.set_xticklabels([str(s) for s,v in sdd]) |
86 plt.show() | 88 plt.show() |
89 | |
90 def ci(rho,n,conf=0.95): | |
91 # Courtesy of https://stats.stackexchange.com/a/18904 | |
92 # rho is (rank) correlation, n is sample size | |
93 stderr=1.0/math.sqrt(n-3) | |
94 z=stats.norm.ppf(1.0-((1.0-conf)/2)) | |
95 delta=z*stderr | |
96 lower=math.tanh(math.atanh(rho)-delta) | |
97 upper=math.tanh(math.atanh(rho)+delta) | |
98 return (lower,upper) | |
99 | |
100 def plot_ci(rhos,n,trim=None,conf=0.95): | |
101 # rhos are (rank) correlation values | |
102 rhos_s=rhos[(-rhos).argsort()] | |
103 if trim is None: | |
104 l=len(rhos) | |
105 else: | |
106 rhos_s=rhos_s[:trim] | |
107 l=trim | |
108 cc=(np.array([ci(r,n,conf) for r in rhos_s])).T | |
109 ue=cc[1]-rhos_s | |
110 le=rhos_s-cc[0] | |
111 #for i in range(len(rhos)): | |
112 #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) | |
113 plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') | |
114 plt.show() | |
87 | 115 |
88 def first_diff(ranks): | 116 def first_diff(ranks): |
89 # first disagreement with baseline == {1,2,...} | 117 # first disagreement with baseline == {1,2,...} |
90 for i in range(len(ranks)): | 118 for i in range(len(ranks)): |
91 if ranks[i]!=i+1.0: | 119 if ranks[i]!=i+1.0: |