changeset 177:a5d54736a77f

renamed from spearman.py
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 24 Nov 2023 20:39:08 +0000
parents 08e903ed5a98
children c42a5f7c97c5
files lib/python/cc/stats.py
diffstat 1 files changed, 192 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/stats.py	Fri Nov 24 20:39:08 2023 +0000
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment
+   The first column is for the whole crawl, then 100 columns for segs 0-99
+   Each row gives the counts for some property, e.g. mime-detected or tld
+
+   For example, assuming ALL.tsv has the whole-crawl warc-only counts
+   and s0.tsv -- s99.tsv have the per-segment counts, in each case with
+   counts in column 1,
+   tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+
+   will produce such a file with
+     * 100 rows, one for each of the top 100 counts
+     * 101 columns, 0 for ALL and 1--100 for segs 0--99
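+
+   Schematically, with made-up counts, each row of name.csv looks like
+     1234567,12016,11987,...,12103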
+
+   Usage: python3 -i stats.py name id
+     where name.csv has the input and id labels the plot titles
+'''
+
+import numpy as np
+from numpy import loadtxt
+from scipy import stats
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+
+import sys, math
+
+def qqa():
+  # q-q plot for the whole crawl
+  sm.qqplot(ALL, line='s')
+  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
+  plt.show()
+
+def qqs():
+  # q-q plots for the best and worst (by variance) segments
+  global xv, xworst, xbest
+  xv=[d.variance for d in xd]
+  xworst=xv.index(max(xv))
+  xbest=xv.index(min(xv))
+  print('best:',xbest,'worst:',xworst)
+  sm.qqplot(x[xbest], line='s')
+  plt.gca().set_title('Best segment (least variance): %s'%xbest)
+  plt.show()
+  sm.qqplot(x[xworst], line='s')
+  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+  plt.show()
+
+def plot_x(sort=False,block=True,all_only=True,title=None):
+  # Make these two subplots, w. and w/o sorting
+  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+  #  for legend hacking
+  if sort:
+    aso=np.argsort(-ALL)
+    plot_all=ALL[aso]
+    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
+  else:
+    plot_all=ALL
+    plot_x=[xd[i].mean for i in range(N)]
+  if title is None:
+    l1='Rank correlation of segment x whole crawl'
+    l2='Mean of segment x whole crawl'
+  else:
+    l1=l2=None
+  plt.plot(plot_all,'rx',label=l1)
+  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
+  if not(all_only):
+    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x ALL other segments')
+    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
+  if title is None:
+    plt.legend(loc='best',fontsize='small') # must come after the plot calls
+  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
+  plt.grid(True)
+  if title is not None:
+    plt.title(title)
+  plt.show(block=block)
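+
+# For example, plot_x(sort=True,all_only=False) also plots the per-segment
+#  mean cross-correlations, reordered by decreasing whole-crawl correlation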
+
+def hist_x(align='mid'):
+  hist(xm,xsd,[xd[i].mean for i in range(N)],
+       'Mean of rank correlation of each segment x ALL other segments',
+       align)
+
+def hist_all(align='mid'):
+  hist(all_m,np.sqrt(all_s.variance),ALL,
+       'Rank correlation of each segment x whole crawl %s'%id,
+       align)
+
+def hist(m,sd,hh,title,align):
+  # twinned x-axis gives tick marks at m+(k*sd) for k in -2..2
+  sdd=[(i,m+(i*sd)) for i in range(-2,3)]
+  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+  sdax=hax.twiny()
+  hax.hist(hh,color='lightblue',align=align)
+  hax.set_title(title)
+  for s,v in sdd:
+       sdax.plot([v,v],[0,18],'b')
+  sdax.set_xlim(hax.get_xlim())
+  sdax.set_ylim(hax.get_ylim())
+  sdax.set_xticks([v for s,v in sdd])
+  sdax.set_xticklabels([str(s) for s,v in sdd])
+  plt.show()
+
+def ci(rho,n,conf=0.95):
+  # Courtesy of https://stats.stackexchange.com/a/18904
+  # rho is (rank) correlation, n is sample size
+  stderr=1.0/math.sqrt(n-3)
+  z=stats.norm.ppf(1.0-((1.0-conf)/2))
+  delta=z*stderr
+  lower=math.tanh(math.atanh(rho)-delta)
+  upper=math.tanh(math.atanh(rho)+delta)
+  return (lower,upper)
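+
+# For example, ci(0.9,100) is roughly (0.85,0.93): even a strong rank
+#  correlation over only 100 rows gives a fairly wide interval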
+
+def plot_ci(rhos,n,trim=None,conf=0.95):
+   # rhos are (rank) correlation values
+   rhos_s=rhos[(-rhos).argsort()]
+   if trim is None:
+     l=len(rhos)
+   else:
+     rhos_s=rhos_s[:trim]
+     l=trim
+   cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
+   ue=cc[1]-rhos_s
+   le=rhos_s-cc[0]
+   #for i in range(len(rhos)):
+     #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+   plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+   plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
+   plt.show()
+
+def first_diff(ranks):
+  # Index of the first disagreement with the baseline ranking {1,2,...};
+  #  len(ranks) if they agree throughout
+  for i in range(len(ranks)):
+    if ranks[i]!=i+1.0:
+      return i
+  return len(ranks)
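+
+# For example, first_diff([1.0,2.0,4.0,3.0]) == 2: positions 0 and 1 agree
+#  with the baseline, index 2 is the first that does not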
+
+def ranks():
+  # Combine the per-segment measures:
+  #  segID, rank corr. wrt ALL, corr. variance, mean cross rank corr.,
+  #  first disagreement
+  # Convert each measure to ranks, where rank 1 is the best segment:
+  #  measures where large is good are negated before ranking
+  all_ranked=stats.rankdata(-ALL,method='average') # large corr is good
+  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
+                                                   # small corr variance is good
+  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
+                                                   # large mean corr is good
+  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
+                                                   # large first diff is good
+  return np.array([[i,
+                    all_ranked[i],
+                    x_variance_ranked[i],
+                    x_mean_ranked[i],
+                    fd_ranked[i]] for i in range(N)])
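+
+# A result row such as [5, 17., 3., 12., 40.5] (values hypothetical) would
+#  mean segment 5 ranked 17th by correlation with ALL, 3rd by (low) variance,
+#  12th by mean cross-correlation and 40.5th by first disagreement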
+
+def main(file,fid):
+  # this should be a class with all these globals as instance vars
+  global counts, corr, ALL, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
+  global aa, aa_by_all, N, id
+  id=fid # deliberately shadows the builtin: the plotting functions read
+         #  this global for their titles
+  counts=loadtxt(file+".csv",delimiter=',')
+  N=counts.shape[1]-1
+  # "If axis=0 (default), then each column represents a variable, with
+  #        observations in the rows"
+  # So each column is a sequence of counts, for whole crawl in column 0
+  #   and for segments 0--N-1 in columns 1--N
+  corr=stats.spearmanr(counts,nan_policy='omit').correlation
+
+  ALL=corr[0][1:]
+  all_s=stats.describe(ALL)
+  all_m=all_s.mean
+
+  x=np.array([np.concatenate((corr[i][1:i],
+                              corr[i][i+1:])) for i in range(1,N+1)])
+  # The above, although transposed, works because the correlation matrix
+  #  is symmetric
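+  # So x[i] holds segment i's correlations with the other N-1 segments,
+  #  omitting column 0 (the whole crawl) and the self-correlation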
+  xd=[stats.describe(x[i]) for i in range(N)]
+  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
+  xm=xs.mean
+  xsd=np.sqrt(xs.variance)
+
+  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
+
+  aa=ranks()
+  aa_by_all=aa[aa[:,1].argsort()]
+
+if __name__ == '__main__':
+  sys.exit(main(*sys.argv[1:]))
+
+
+### Remember the rows/columns distinction: counts[0] is an array of N+1
+###   counts for the most common label in the complete crawl, one from the
+###   complete crawl itself and one from each of the N segments, whereas
+###   counts[:,0] is an array of decreasing counts, one per label,
+###   all from the complete crawl
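+
+### Example interactive session (file name and crawl label are hypothetical):
+###   $ python3 -i stats.py all_100 CC-MAIN-2019-35
+###   >>> plot_ci(ALL,100)    # segment x whole-crawl correlations with 95% CIs
+###   >>> aa_by_all[:5,0]     # IDs of the five best segments by that measure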