
#!/usr/bin/env python3
'''Rank correlation processing for a CSV tabulation of counts by segment.
   The first column is for the whole crawl, then 100 columns for segments 0-99.
   Each row gives the counts for one value of some property, e.g. mime-detected or tld.

   For example, assuming all.tsv has the whole-crawl warc-only counts
   and s...tsv have the segment counts, all with counts in column 1,

   tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv

   will produce such a file with
     * 100 rows, one for each of the top 100 counts
     * 101 columns: column 0 for the whole crawl and columns 1--100 for segments 0--99

   Usage: python3 -i spearman.py name
     where name.csv contains the input
'''
import numpy as np
from numpy import loadtxt
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pylab

import sys
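
# The docstring above describes the expected input shape; the helper
#  below is NOT part of the original pipeline, just a sketch for
#  generating a small synthetic name.csv to experiment with (the name,
#  sizes and noise model are all illustrative assumptions):
def make_test_csv(name='test',nrows=100,nsegs=100,seed=0):
  # One descending whole-crawl count per row, plus a noisy per-segment
  #  share of it in each of the nsegs segment columns
  rng=np.random.default_rng(seed)
  whole=np.sort(rng.integers(10**4,10**6,size=nrows))[::-1]
  segs=(whole[:,None]//nsegs)+rng.integers(-50,51,size=(nrows,nsegs))
  np.savetxt(name+'.csv',np.hstack((whole[:,None],segs)),
             fmt='%d',delimiter=',')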

def qqa():
  # q-q plot for the whole crawl
  sm.qqplot(all, line='s')
  plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
  plt.show()

def qqs():
  # q-q plots for the best and worst (by variance) segments
  global xv, xworst, xbest
  xv=[d.variance for d in xd]
  xworst=xv.index(max(xv))
  xbest=xv.index(min(xv))
  print('best:',xbest,'worst:',xworst)
  sm.qqplot(x[xbest], line='s')
  plt.gca().set_title('Best segment (least variance): %s'%xbest)
  plt.show()
  sm.qqplot(x[xworst], line='s')
  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
  plt.show()
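
# Both of the above q-q plots are against a normal distribution
#  (sm.qqplot's default), so a roughly straight line suggests the
#  correlations are close to normally distributed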

def plot_x(sort=False,block=True):
  # TODO: make these into two subplots
  if sort:
    aso=np.argsort(-all)
    plot_all=all[aso]
    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
  else:
    plot_all=all
    plot_x=[xd[i].mean for i in range(N)]
  plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl')
  plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl')
  plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
  plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
  plt.axis([0,N-1,0.8,1.0])
  plt.legend(loc='best')
  plt.grid(True)
  plt.show(block=block)
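
# Typical interactive use (illustrative): plot_x() keeps the input order;
#  plot_x(sort=True,block=False) re-orders by descending whole-crawl
#  correlation and returns control to the interpreter immediately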

def hist_x():
  hist(xm,xsd,[xd[i].mean for i in range(N)],
       'Mean of rank correlation of each segment x all other segments')

def hist_all():
  hist(all_m,np.sqrt(all_s.variance),all,
       'Rank correlation of each segment x whole crawl')

def hist(m,sd,hh,title):
  # Histogram of hh, with vertical guide lines at m+(i*sd) for i in -2..2,
  #  labelled in units of sd on a twinned x-axis
  sdd=[(i,m+(i*sd)) for i in range(-2,3)]
  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
  sdax=hax.twiny()
  hax.hist(hh,color='lightblue')
  hax.set_title(title)
  for s,v in sdd:
    sdax.plot([v,v],[0,hax.get_ylim()[1]],'b')
  sdax.set_xlim(hax.get_xlim())
  sdax.set_ylim(hax.get_ylim())
  sdax.set_xticks([v for s,v in sdd])
  sdax.set_xticklabels([str(s) for s,v in sdd])
  plt.show()
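
# e.g. hist_all() shows the spread of segment-vs-whole-crawl correlations
#  and hist_x() the spread of per-segment mean cross-correlations, each
#  with guide lines at the mean and +/-1, +/-2 standard deviations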

def first_diff(ranks):
  # Index of the first disagreement with the baseline ranking {1,2,...};
  #  returns len(ranks) if they agree throughout
  for i in range(len(ranks)):
    if ranks[i]!=i+1.0:
      return i
  return len(ranks)
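
# For example (illustrative values): first_diff(np.array([1.,2.,4.,3.]))
#  returns 2, and first_diff(np.array([1.,2.,3.])) returns 3, i.e. no
#  disagreement was found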

def ranks():
  # Combine the per-segment measures:
  #  segID, rank correlation wrt the whole crawl, variance of the
  #  cross-segment correlations, mean cross-segment correlation,
  #  first disagreement with the whole-crawl ranking
  # Each measure is converted to ranks, with rank 1 for the best segment:
  #  measures where large is good are negated before ranking
  all_ranked=stats.rankdata(-all,method='average') # invert since
                                                   #  large corr is good
  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
                                                  # small corr variance is good
  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
                                                   # invert since
                                                   #  large mean corr is good
  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
                                                   # invert since
                                                   #  large first diff is good
  return np.array([[i,
                    all_ranked[i],
                    x_variance_ranked[i],
                    x_mean_ranked[i],
                    fd_ranked[i]] for i in range(N)])
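
# A possible way to combine the four rank columns into one overall
#  ordering (an unweighted rank sum; the combination rule is an
#  assumption, not something the original code commits to):
def overall_ranking():
  r=ranks()
  totals=r[:,1:].sum(axis=1)                  # low total == good ranks
  return r[np.argsort(totals),0].astype(int)  # segIDs, best first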

counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
N=counts.shape[0]
# "If axis=0 (default), then each column represents a variable, with
#        observations in the rows"
# So each column is a sequence of counts, for whole crawl in column 0
#   and for segments 0--N-1 in columns 1--N
corr=stats.spearmanr(counts,nan_policy='omit').correlation

all=corr[0][1:]
all_s=stats.describe(all)
all_m=all_s.mean

x=np.array([np.concatenate((corr[i][1:i],
                            corr[i][i+1:])) for i in range(1,N+1)])
# Using rows here rather than columns works because the correlation
#  matrix is symmetric
xd=[stats.describe(x[i]) for i in range(N)]
xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
xm=xs.mean
xsd=np.sqrt(xs.variance)
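
# Shape sanity checks, handy under python3 -i (assuming the input file
#  has N+1 columns as described in the docstring):
#   corr.shape == (N+1,N+1); all.shape == (N,); x.shape == (N,N-1)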

x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
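
# x_ranks[i][j] is the rank (1 == largest count) of label j within
#  segment i; since the rows arrive in descending whole-crawl order,
#  first_diff(x_ranks[i]) is where segment i's ordering first departs
#  from the whole crawl's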

### Note the row/column distinction: counts[0] is an array of N+1 counts
###   for the most common label in the complete crawl,
###   one from the complete crawl and one from each of the segments,
###   whereas counts[:,0] is an array of N decreasing counts,
###   one for each of the top N labels in the complete crawl