Mercurial > hg > cc > cirrus_work
changeset 31:e7c8e64c2fdd
get multi-ranking done right
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 17 Nov 2022 13:51:19 +0000 |
parents | c73ec9deabbe |
children | 91741bf3ab51 |
files | bin/spearman.py |
diffstat | 1 files changed, 19 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/spearman.py Thu Nov 17 11:27:07 2022 +0000
+++ b/bin/spearman.py Thu Nov 17 13:51:19 2022 +0000
@@ -45,7 +45,7 @@
   plt.gca().set_title('Worst segment (most variance): %s'%xworst)
   plt.show()

-def plot_x():
+def plot_x(block=True):
   plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
   plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
   plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
@@ -53,7 +53,7 @@
   plt.axis([0,99,0.8,1.0])
   plt.legend(loc='best')
   plt.grid(True)
-  plt.show()
+  plt.show(block=block)

 def hist():
   sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
@@ -79,7 +79,22 @@
 def ranks():
   # Combine segment measures:
   # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
-  return np.array([i,all[i],1.0/xd[i].variance,xd[i].mean,first_diff(ranks[i])])
+  # convert to ranks, smallest value == highest rank
+  all_ranked=stats.rankdata(-all,method='average') # invert since
+  # large corr is good
+  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(100)])
+  # small corr variance is good
+  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(100)])
+  # invert since
+  # large mean corr is good
+  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(100)])
+  # invert since
+  # large first diff is good
+  return np.array([[i,
+                    all_ranked[i],
+                    x_variance_ranked[i],
+                    x_mean_ranked[i],
+                    fd_ranked[i]] for i in range(100)])

 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
 # "If axis=0 (default), then each column represents a variable, with
@@ -101,7 +116,7 @@
 xm=xs.mean
 xsd=np.sqrt(xs.variance)

-ranks=[stats.rankdata(-counts[:,i],method='average') for for i in range(1,100)]
+x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)]

 ### I need to review rows, e.g. counts[0] is an array of 101 counts
 ### for the most common label in the complete crawl,
```