changeset 32:91741bf3ab51

add sort flag to plot_x
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 22 Nov 2022 11:02:51 +0000
parents e7c8e64c2fdd
children 317bf47b506c
files bin/spearman.py
diffstat 1 files changed, 27 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/bin/spearman.py	Thu Nov 17 13:51:19 2022 +0000
+++ b/bin/spearman.py	Tue Nov 22 11:02:51 2022 +0000
@@ -45,12 +45,20 @@
   plt.gca().set_title('Worst segment (most variance): %s'%xworst)
   plt.show()
 
-def plot_x(block=True):
-  plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
-  plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
-  plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
-  plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl')
-  plt.axis([0,99,0.8,1.0])
+def plot_x(sort=False,block=True):
+  # TODO: make these two series two separate subplots
+  if sort:
+    aso=np.argsort(-all)
+    plot_all=all[aso]
+    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
+  else:
+    plot_all=all
+    plot_x=[xd[i].mean for i in range(N)]
+  plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl')
+  plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl')
+  plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+  plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
+  plt.axis([0,N-1,0.8,1.0])
   plt.legend(loc='best')
   plt.grid(True)
   plt.show(block=block)
@@ -59,7 +67,7 @@
   sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
   fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
   sdax=hax.twiny()
-  hax.hist([xd[i].mean for i in range(100)],color='lightblue')
+  hax.hist([xd[i].mean for i in range(N)],color='lightblue')
   hax.set_title('Mean of rank correlation of each segment x all other segments')
   for s,v in sdd:
        sdax.plot([v,v],[0,18],'b')
@@ -82,25 +90,26 @@
   # convert to ranks, smallest value == highest rank
   all_ranked=stats.rankdata(-all,method='average') # invert since
                                                    #  large corr is good
-  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(100)])
+  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
                                                   # small corr variance is good
-  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(100)])
+  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
                                                    # invert since
                                                    #  large mean corr is good
-  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(100)])
+  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
                                                    # invert since
                                                    #  large first diff is good
   return np.array([[i,
                     all_ranked[i],
                     x_variance_ranked[i],
                     x_mean_ranked[i],
-                    fd_ranked[i]] for i in range(100)])
+                    fd_ranked[i]] for i in range(N)])
 
 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+N=counts.shape[0]
 # "If axis=0 (default), then each column represents a variable, with
 #        observations in the rows"
 # So each column is a sequence of counts, for whole crawl in column 0
-#   and for segments 0--99 in columns 1--100
+#   and for segments 0 to N-1 in columns 1 to N (N is the row count, so this assumes N rows and N+1 columns)
 corr=stats.spearmanr(counts,nan_policy='omit').correlation
 
 all=corr[0][1:]
@@ -108,18 +117,18 @@
 all_m=all_s.mean
 
 x=np.array([np.concatenate((corr[i][1:i],
-                            corr[i][i+1:])) for i in range(1,101)])
+                            corr[i][i+1:])) for i in range(1,N+1)])
 # The above, although transposed, works because the correlation matrix
 #  is symmetric
-xd=[stats.describe(x[i]) for i in range(100)]
-xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
+xd=[stats.describe(x[i]) for i in range(N)]
+xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
 xm=xs.mean
 xsd=np.sqrt(xs.variance)
 
-x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)]
+x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
 
-### I need to review rows, e.g. counts[0] is an array of 101 counts
+### I need to review rows, e.g. counts[0] is an array of N+1 counts
 ###   for the most common label in the complete crawl,
 ###   from the complete crawl and all the segments
-### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts
+### versus columns, e.g. counts[:,0] is an array of N decreasing counts
 ###   for all the labels in the complete crawl