Mercurial > hg > cc > cirrus_work
comparison bin/spearman.py @ 31:e7c8e64c2fdd
get multi-ranking done right
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 17 Nov 2022 13:51:19 +0000 |
parents | c73ec9deabbe |
children | 91741bf3ab51 |
comparison
equal
deleted
inserted
replaced
30:c73ec9deabbe | 31:e7c8e64c2fdd |
---|---|
43 plt.show() | 43 plt.show() |
44 sm.qqplot(x[xworst], line='s') | 44 sm.qqplot(x[xworst], line='s') |
45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) | 45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) |
46 plt.show() | 46 plt.show() |
47 | 47 |
48 def plot_x(): | 48 def plot_x(block=True): |
49 plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments') | 49 plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments') |
50 plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means') | 50 plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means') |
51 plt.plot(all,'rx',label='Rank correlation of segment x whole crawl') | 51 plt.plot(all,'rx',label='Rank correlation of segment x whole crawl') |
52 plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl') | 52 plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl') |
53 plt.axis([0,99,0.8,1.0]) | 53 plt.axis([0,99,0.8,1.0]) |
54 plt.legend(loc='best') | 54 plt.legend(loc='best') |
55 plt.grid(True) | 55 plt.grid(True) |
56 plt.show() | 56 plt.show(block=block) |
57 | 57 |
58 def hist(): | 58 def hist(): |
59 sdd=[(i,xm-(i*xsd)) for i in range(-2,3)] | 59 sdd=[(i,xm-(i*xsd)) for i in range(-2,3)] |
60 fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 | 60 fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 |
61 sdax=hax.twiny() | 61 sdax=hax.twiny() |
77 return i+1 | 77 return i+1 |
78 | 78 |
79 def ranks(): | 79 def ranks(): |
80 # Combine segment measures: | 80 # Combine segment measures: |
81 # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement | 81 # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement |
82 return np.array([i,all[i],1.0/xd[i].variance,xd[i].mean,first_diff(ranks[i])]) | 82 # convert to ranks, smallest value == highest rank |
83 all_ranked=stats.rankdata(-all,method='average') # invert since | |
84 # large corr is good | |
85 x_variance_ranked=stats.rankdata([xd[i].variance for i in range(100)]) | |
86 # small corr variance is good | |
87 x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(100)]) | |
88 # invert since | |
89 # large mean corr is good | |
90 fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(100)]) | |
91 # invert since | |
92 # large first diff is good | |
93 return np.array([[i, | |
94 all_ranked[i], | |
95 x_variance_ranked[i], | |
96 x_mean_ranked[i], | |
97 fd_ranked[i]] for i in range(100)]) | |
83 | 98 |
84 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') | 99 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') |
85 # "If axis=0 (default), then each column represents a variable, with | 100 # "If axis=0 (default), then each column represents a variable, with |
86 # observations in the rows" | 101 # observations in the rows" |
87 # So each column is a sequence of counts, for whole crawl in column 0 | 102 # So each column is a sequence of counts, for whole crawl in column 0 |
99 xd=[stats.describe(x[i]) for i in range(100)] | 114 xd=[stats.describe(x[i]) for i in range(100)] |
100 xs=stats.describe(np.array([xd[i].mean for i in range(100)])) | 115 xs=stats.describe(np.array([xd[i].mean for i in range(100)])) |
101 xm=xs.mean | 116 xm=xs.mean |
102 xsd=np.sqrt(xs.variance) | 117 xsd=np.sqrt(xs.variance) |
103 | 118 |
104 ranks=[stats.rankdata(-counts[:,i],method='average') for for i in range(1,100)] | 119 x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)] |
105 | 120 |
106 ### I need to review rows, e.g. counts[0] is an array of 101 counts | 121 ### I need to review rows, e.g. counts[0] is an array of 101 counts |
107 ### for the most common label in the complete crawl, | 122 ### for the most common label in the complete crawl, |
108 ### from the complete crawl and all the segments | 123 ### from the complete crawl and all the segments |
109 ### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts | 124 ### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts |