changeset 31:e7c8e64c2fdd

get multi-ranking done right
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 17 Nov 2022 13:51:19 +0000
parents c73ec9deabbe
children 91741bf3ab51
files bin/spearman.py
diffstat 1 files changed, 19 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/bin/spearman.py	Thu Nov 17 11:27:07 2022 +0000
+++ b/bin/spearman.py	Thu Nov 17 13:51:19 2022 +0000
@@ -45,7 +45,7 @@
   plt.gca().set_title('Worst segment (most variance): %s'%xworst)
   plt.show()
 
-def plot_x():
+def plot_x(block=True):
   plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
   plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
   plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
@@ -53,7 +53,7 @@
   plt.axis([0,99,0.8,1.0])
   plt.legend(loc='best')
   plt.grid(True)
-  plt.show()
+  plt.show(block=block)
 
 def hist():
   sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
@@ -79,7 +79,22 @@
 def ranks():
   # Combine segment measures:
   #  segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
-  return np.array([i,all[i],1.0/xd[i].variance,xd[i].mean,first_diff(ranks[i])])
+  # convert each measure to ranks; rank 1 (the smallest rank number) is best
+  all_ranked=stats.rankdata(-all,method='average') # invert since
+                                                   #  large corr is good
+  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(100)])
+                                                  # small corr variance is good
+  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(100)])
+                                                   # invert since
+                                                   #  large mean corr is good
+  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(100)])
+                                                   # invert since
+                                                   #  large first diff is good
+  return np.array([[i,
+                    all_ranked[i],
+                    x_variance_ranked[i],
+                    x_mean_ranked[i],
+                    fd_ranked[i]] for i in range(100)])
 
 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
 # "If axis=0 (default), then each column represents a variable, with
@@ -101,7 +116,7 @@
 xm=xs.mean
 xsd=np.sqrt(xs.variance)
 
-ranks=[stats.rankdata(-counts[:,i],method='average') for for i in range(1,100)]
+x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)]
 
 ### I need to review rows, e.g. counts[0] is an array of 101 counts
 ###   for the most common label in the complete crawl,