changeset 176:08e903ed5a98
renamed to stats.py
author | Henry S. Thompson <ht@inf.ed.ac.uk>
---|---
date | Fri, 24 Nov 2023 20:38:39 +0000
parents | 684370d1bf22
children | a5d54736a77f
files | lib/python/cc/spearman.py
diffstat | 1 files changed, 0 insertions(+), 192 deletions(-)
```diff
--- a/lib/python/cc/spearman.py	Fri Nov 24 19:52:52 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,192 +0,0 @@
-#!/usr/bin/env python3
-'''Rank correlation processing for a csv tabulation of counts by segment
-   First column is for whole crawl, then 100 columns for segs 0-99
-   Each row is counts for some property, e.g. mime-detected or tld
-
-   For example, assuming ALL.tsv has the whole-crawl warc-only counts
-   and s...tsv have the segment counts, ALL with counts in column 1,
-
-     tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
-
-   will produce such a file with
-    * 100 rows, one for each of the top 100 counts
-    * 101 columns, 0 for ALL and 1--100 for segs 0--99
-
-   Usage: python3 -i spearman.py name id
-    where name.csv has the input
-'''
-
-import numpy as np
-from numpy import loadtxt
-from scipy import stats
-import statsmodels.api as sm
-import matplotlib.pyplot as plt
-import pylab
-
-import sys, math
-
-def qqa():
-  # q-q plot for the whole crawl
-  sm.qqplot(ALL, line='s')
-  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
-  plt.show()
-
-def qqs():
-  # q-q plots for the best and worst (by variance) segments
-  global xv, xworst, xbest
-  xv=[d.variance for d in xd]
-  xworst=xv.index(max(xv))
-  xbest=xv.index(min(xv))
-  print(xbest,xworst)
-  sm.qqplot(x[xbest], line='s')
-  plt.gca().set_title('Best segment (least variance): %s'%xbest)
-  plt.show()
-  sm.qqplot(x[xworst], line='s')
-  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
-  plt.show()
-
-def plot_x(sort=False,block=True,all_only=True,title=None):
-  # Make these two subplots, w. and w/o sorting
-  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
-  #  for legend hacking
-  if sort:
-    aso=np.argsort(-ALL)
-    plot_all=ALL[aso]
-    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
-  else:
-    plot_all=ALL
-    plot_x=[xd[i].mean for i in range(N)]
-  if title is None:
-    l1='Rank correlation of segment x whole crawl'
-    l2='Mean of segment x whole crawl'
-    plt.legend(loc='best',fontsize='small')
-  else:
-    l1=l2=None
-  plt.plot(plot_all,'rx',label=l1)
-  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
-  if not(all_only):
-    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x ALL other segments')
-    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
-  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
-  plt.grid(True)
-  if title is not None:
-    plt.title(title)
-  plt.show(block=block)
-
-def hist_x(align='mid'):
-  hist(xm,xsd,[xd[i].mean for i in range(N)],
-       'Mean of rank correlation of each segment x ALL other segments',
-       align)
-
-def hist_all(align='mid'):
-  hist(all_m,np.sqrt(all_s.variance),ALL,
-       'Rank correlation of each segment x whole crawl %s'%id,
-       align)
-
-def hist(m,sd,hh,title,align):
-  sdd=[(i,m-(i*sd)) for i in range(-2,3)]
-  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
-  sdax=hax.twiny()
-  hax.hist(hh,color='lightblue',align=align)
-  hax.set_title(title)
-  for s,v in sdd:
-    sdax.plot([v,v],[0,18],'b')
-  sdax.set_xlim(hax.get_xlim())
-  sdax.set_ylim(hax.get_ylim())
-  sdax.set_xticks([v for s,v in sdd])
-  sdax.set_xticklabels([str(s) for s,v in sdd])
-  plt.show()
-
-def ci(rho,n,conf=0.95):
-  # Courtesy of https://stats.stackexchange.com/a/18904
-  # rho is (rank) correlation, n is sample size
-  stderr=1.0/math.sqrt(n-3)
-  z=stats.norm.ppf(1.0-((1.0-conf)/2))
-  delta=z*stderr
-  lower=math.tanh(math.atanh(rho)-delta)
-  upper=math.tanh(math.atanh(rho)+delta)
-  return (lower,upper)
-
-def plot_ci(rhos,n,trim=None,conf=0.95):
-  # rhos are (rank) correlation values
-  rhos_s=rhos[(-rhos).argsort()]
-  if trim is None:
-    l=len(rhos)
-  else:
-    rhos_s=rhos_s[:trim]
-    l=trim
-  cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
-  ue=cc[1]-rhos_s
-  le=rhos_s-cc[0]
-  #for i in range(len(rhos)):
-    #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
-  plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-  plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
-  plt.show()
-
-def first_diff(ranks):
-  # first disagreement with baseline == {1,2,...}
-  for i in range(len(ranks)):
-    if ranks[i]!=i+1.0:
-      return i
-  return i+1
-
-def ranks():
-  # Combine segment measures:
-  #  segID,rank corr. wrt ALL,inverse variance, mean cross rank corr.,first disagreement
-  # convert to ranks, smallest value == highest rank
-  all_ranked=stats.rankdata(-ALL,method='average') # invert since
-                                                   #  large corr is good
-  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
-                                                   # small corr variance is good
-  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
-                                                   # invert since
-                                                   #  large mean corr is good
-  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
-                                                   # invert since
-                                                   #  large first diff is good
-  return np.array([[i,
-                    all_ranked[i],
-                    x_variance_ranked[i],
-                    x_mean_ranked[i],
-                    fd_ranked[i]] for i in range(N)])
-
-def main(file,id):
-  # this should be a class with all these globals as instance vars
-  global counts, corr, ALL, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
-  global aa, aa_by_all, N
-  counts=loadtxt(file+".csv",delimiter=',')
-  N=counts.shape[1]-1
-  # "If axis=0 (default), then each column represents a variable, with
-  #  observations in the rows"
-  # So each column is a sequence of counts, for whole crawl in column 0
-  #  and for segments 0--N-1 in columns 1--N
-  corr=stats.spearmanr(counts,nan_policy='omit').correlation
-
-  ALL=corr[0][1:]
-  all_s=stats.describe(ALL)
-  all_m=all_s.mean
-
-  x=np.array([np.concatenate((corr[i][1:i],
-                              corr[i][i+1:])) for i in range(1,N+1)])
-  # The above, although transposed, works because the correlation matrix
-  #  is symmetric
-  xd=[stats.describe(x[i]) for i in range(N)]
-  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
-  xm=xs.mean
-  xsd=np.sqrt(xs.variance)
-
-  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
-
-  aa=ranks()
-  aa_by_all=aa[aa[:,1].argsort()]
-
-if __name__ == '__main__':
-  sys.exit(main(*sys.argv[1:]))
-
-
-### I need to review rows, e.g. counts[0] is an array of N+1 counts
-###  for the most common label in the complete crawl,
-###  from the complete crawl and ALL the segments
-### versus columns, e.g. counts[:,0] is an array of N decreasing counts
-###  for all the labels in the complete crawl
```
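The `ci` function in the deleted file is a standard Fisher z-transformation interval: z = atanh(rho) is approximately normal with standard error 1/sqrt(n-3), so the interval is built in z-space and mapped back with tanh. A minimal standalone sketch of that same computation; the sample inputs (rho=0.97, n=100, to match the top-100-counts setup the docstring describes) are hypothetical:

```python
# Standalone sketch of the Fisher z-transform interval used by ci() above.
# The example values rho=0.97, n=100 are hypothetical.
import math
from scipy import stats

def fisher_ci(rho, n, conf=0.95):
    stderr = 1.0 / math.sqrt(n - 3)              # SE of atanh(rho)
    z = stats.norm.ppf(1.0 - (1.0 - conf) / 2)   # two-sided normal critical value
    delta = z * stderr
    # Build the interval in z-space, then map back to correlation space
    return (math.tanh(math.atanh(rho) - delta),
            math.tanh(math.atanh(rho) + delta))

print(fisher_ci(0.97, 100))  # roughly (0.956, 0.980)
```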
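The closing `###` comments distinguish rows (one label's counts across the whole crawl plus every segment) from columns (all labels' counts for one source). A toy demonstration of that layout and of the `spearmanr` axis convention `main()` relies on; the matrix here is made up (3 labels as rows; whole-crawl counts in column 0, two segments in columns 1-2):

```python
# Toy demonstration of the data layout main() assumes; the numbers are invented.
import numpy as np
from scipy import stats

counts = np.array([[900., 450., 430.],   # most common label overall
                   [300., 160.,  60.],
                   [100.,  40., 140.]])  # note segment 2 ranks labels differently

print(counts[0])     # row: one label's counts across crawl + all segments
print(counts[:, 0])  # column: all labels' counts within the whole crawl

# With a 2-D input, spearmanr treats each *column* as a variable, so row 0
# of the correlation matrix holds each segment's rank correlation with the
# whole crawl -- exactly what main() extracts as ALL=corr[0][1:].
corr = stats.spearmanr(counts).correlation
print(corr[0][1:])   # [1.0, 0.5]: segment 1 agrees fully, segment 2 does not
```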