changeset 29:669a0b120d34

start work on ranking, lose faith in getting row vs. column correct every time
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 16 Nov 2022 19:52:50 +0000
parents 7ffb686ca060
children c73ec9deabbe
files bin/spearman.py
diffstat 1 files changed, 25 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/bin/spearman.py	Wed Nov 16 17:29:55 2022 +0000
+++ b/bin/spearman.py	Wed Nov 16 19:52:50 2022 +0000
@@ -67,17 +67,37 @@
   sdax.set_xticklabels([str(s) for s,v in sdd])
   plt.show()
 
+def first_diff(ranks):
+  """Return the 0-based index of the first rank differing from the baseline 1,2,...; len(ranks) if none (0 if empty)."""
+  for i,r in enumerate(ranks):
+    if r!=i+1.0:
+      return i
+  return len(ranks)
+
+def ranks():
+  # Combine segment measures; NOTE(review): i is not a parameter or local here, and ranks is rebound to a list below:
+  #  segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
+  return np.array([i,all[i],1.0/xd[i].variance,xd[i].mean,first_diff(ranks[i])])
+
 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
-o=stats.spearmanr(counts,nan_policy='omit')
+# "If axis=0 (default), then each column represents a variable, with
+#        observations in the rows"
+ranks=[stats.rankdata(-counts[i],method='average') for i in range(1,100)]
+corr=stats.spearmanr(counts,nan_policy='omit').correlation
 
-all=o.correlation[0][1:]
+all=corr[0][1:]
 all_s=stats.describe(all)
 all_m=all_s.mean
-# Should get the confidence interval for this, so we can
-#  use it in plot_x
 
-x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)])
+x=np.array([np.concatenate((corr[i][1:i],
+                            corr[i][i+1:])) for i in range(1,101)])
 xd=[stats.describe(x[i]) for i in range(100)]
 xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
 xm=xs.mean
 xsd=np.sqrt(xs.variance)
+
+### I need to review rows, e.g. counts[0] is an array of 101 counts
+###   for the most common label in the complete crawl,
+###   from the complete crawl and all the segments
+### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts
+###   for all the labels in the complete crawl