Mercurial > hg > cc > cirrus_work

--- a/bin/spearman.py	Tue Nov 15 19:37:28 2022 +0000
+++ b/bin/spearman.py	Wed Nov 16 17:28:56 2022 +0000
@@ -1,4 +1,19 @@
 #!/usr/bin/env python3
+'''Rank correlation processing for a CSV tabulation of counts by segment.
+   The first column is for the whole crawl, followed by 100 columns for segments 0-99.
+   Each row holds the counts for some property, e.g. mime-detected or tld.
+
+   For example
+
+   tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+
+   will produce such a file with 100 rows, assuming all.tsv has the whole-crawl
+   warc-only counts and s0.tsv ... s99.tsv have the segment counts, with all counts in column 1.
+
+   Usage: python3 -i spearman.py name
+     where name.csv has the input
+'''
+
 import numpy as np
 from numpy import loadtxt
 from scipy import stats
@@ -8,7 +23,61 @@

 import sys

-cc19=loadtxt(sys.argv[1],delimiter=',')
-cc19s_o=stats.spearmanr(cc19,nan_policy='omit')
-cc19s_x=np.array([np.concatenate((cc19s_o.correlation[i][1:i],cc19s_o.correlation[i][i+1:])) for i in range(1,101)])
-cc19s_xd=[stats.describe(cc19s_x[i]) for i in range(100)]
+def qqa():
+  # q-q plot for the whole crawl
+  sm.qqplot(all, line='s')
+  plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
+  plt.show()
+
+def qqs():
+  # q-q plots for the best and worst (by variance) segments
+  global xv, xworst, xbest
+  xv=[d.variance for d in xd]
+  xworst=xv.index(max(xv))
+  xbest=xv.index(min(xv))
+  print(xbest,xworst)
+  sm.qqplot(x[xbest], line='s')
+  plt.gca().set_title('Best segment (least variance): %s'%xbest)
+  plt.show()
+  sm.qqplot(x[xworst], line='s')
+  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+  plt.show()
+
+def plot_x():
+  plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
+  plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
+  plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
+  plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl')
+  plt.axis([0,99,0.8,1.0])
+  plt.legend(loc='best')
+  plt.grid(True)
+  plt.show()
+
+def hist():
+  sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
+  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+  sdax=hax.twiny()
+  hax.hist([xd[i].mean for i in range(100)],color='lightblue')
+  hax.set_title('Mean of rank correlation of each segment x all other segments')
+  for s,v in sdd:
+       sdax.plot([v,v],[0,18],'b')
+  sdax.set_xlim(hax.get_xlim())
+  sdax.set_ylim(hax.get_ylim())
+  sdax.set_xticks([v for s,v in sdd])
+  sdax.set_xticklabels([str(s) for s,v in sdd])
+  plt.show()
+
+counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+o=stats.spearmanr(counts,nan_policy='omit')
+
+all=o.correlation[0][1:]
+all_s=stats.describe(all)
+all_m=all_s.mean
+# TODO: compute the confidence interval for this mean (all_m), so that we can
+#  use it in plot_x
+
+x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)])
+xd=[stats.describe(x[i]) for i in range(100)]
+xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
+xm=xs.mean
+xsd=np.sqrt(xs.variance)