view bin/spearman.py @ 29:669a0b120d34
start work on ranking,
lose faith in getting row vs. column correct every time
author   | Henry S. Thompson <ht@inf.ed.ac.uk>
---------|--------------------------------------
date     | Wed, 16 Nov 2022 19:52:50 +0000
parents  | 21da4d6521db
children | c73ec9deabbe
```python
#!/usr/bin/env python3
'''Rank correlation processing for a csv tabulation of counts by segment

   First column is for the whole crawl, then 100 columns for segs 0-99.
   Each row is counts for some property, e.g. mime-detected or tld.

   For example

     tr -d ',' <all.tsv | head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv

   will produce such a file with 100 rows, assuming all.tsv has the
   whole-crawl warc-only counts and s...tsv have the segment counts,
   all counts in column 1.

   Usage: python3 -i spearman.py name
     where name.csv has the input
'''

import numpy as np
from numpy import loadtxt
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pylab
import sys


def qqa():
    # q-q plot for the whole crawl
    sm.qqplot(all, line='s')
    plt.gca().set_title(
        'Rank correlation per segment wrt whole crawl (warc results only)')
    plt.show()


def qqs():
    # q-q plots for the best and worst (by variance) segments
    global xv, xworst, xbest
    xv = [d.variance for d in xd]
    xworst = xv.index(max(xv))
    xbest = xv.index(min(xv))
    print(xbest, xworst)
    sm.qqplot(x[xbest], line='s')
    plt.gca().set_title('Best segment (least variance): %s' % xbest)
    plt.show()
    sm.qqplot(x[xworst], line='s')
    plt.gca().set_title('Worst segment (most variance): %s' % xworst)
    plt.show()


def plot_x():
    plt.plot([xd[i].mean for i in range(100)], 'bx',
             label='Mean of rank correlation of each segment x all other segments')
    plt.plot([0, 99], [xm, xm], 'b', label='Mean of segment x segment means')
    plt.plot(all, 'rx', label='Rank correlation of segment x whole crawl')
    plt.plot([0, 99], [all_m, all_m], 'r', label='Mean of segment x whole crawl')
    plt.axis([0, 99, 0.8, 1.0])
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()


def hist():
    # histogram of per-segment mean rank correlations, with a second
    # x-axis marking standard deviations from their mean
    sdd = [(i, xm - (i * xsd)) for i in range(-2, 3)]
    fig, hax = plt.subplots()  # Thanks to https://stackoverflow.com/a/7769497
    sdax = hax.twiny()
    hax.hist([xd[i].mean for i in range(100)], color='lightblue')
    hax.set_title('Mean of rank correlation of each segment x all other segments')
    for s, v in sdd:
        sdax.plot([v, v], [0, 18], 'b')
    sdax.set_xlim(hax.get_xlim())
    sdax.set_ylim(hax.get_ylim())
    sdax.set_xticks([v for s, v in sdd])
    sdax.set_xticklabels([str(s) for s, v in sdd])
    plt.show()


def first_diff(ranks):
    # first disagreement with baseline == {1,2,...}
    for i in range(len(ranks)):
        if ranks[i] != i + 1.0:
            return i
    return i + 1


def seg_measures(i):
    # Combine measures for segment i:
    #  segID, rank corr. wrt all, inverse variance,
    #  mean cross rank corr., first disagreement
    # (was ranks(), renamed so it isn't clobbered by the ranks list
    #  below, and given the segment index it needs as a parameter)
    return np.array([i, all[i], 1.0 / xd[i].variance, xd[i].mean,
                     first_diff(ranks[i])])


counts = loadtxt(sys.argv[1] + ".csv", delimiter=',')

# "If axis=0 (default), then each column represents a variable, with
#  observations in the rows"
ranks = [stats.rankdata(-counts[i], method='average') for i in range(1, 100)]
corr = stats.spearmanr(counts, nan_policy='omit').correlation

all = corr[0][1:]  # whole crawl vs. each segment (NB shadows the builtin)
all_s = stats.describe(all)
all_m = all_s.mean

# each segment's correlations with the other 99 segments (dropping the
# whole-crawl column and the self-correlation)
x = np.array([np.concatenate((corr[i][1:i],
                              corr[i][i + 1:])) for i in range(1, 101)])
xd = [stats.describe(x[i]) for i in range(100)]
xs = stats.describe(np.array([xd[i].mean for i in range(100)]))
xm = xs.mean
xsd = np.sqrt(xs.variance)

### I need to review rows, e.g. counts[0] is an array of 101 counts
###  for the most common label in the complete crawl,
###  from the complete crawl and all the segments,
### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts
###  for all the labels in the complete crawl
```
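
Since the commit message and the closing comment both worry about row vs. column orientation, here is a minimal sketch (not part of spearman.py, with invented counts) confirming what the quoted scipy docstring says: with the default axis=0, `spearmanr` treats each *column* as a variable, so for a counts matrix shaped (properties) x (1 + segments), `corr[0][1:]` is the whole-crawl column against each segment column:

```python
import numpy as np
from scipy import stats

# 4 properties (rows) x (whole crawl + 2 segments) (columns), invented counts
toy = np.array([[90, 88, 91],
                [50, 52, 12],
                [30, 29, 35],
                [10,  9, 40]])

corr = stats.spearmanr(toy).correlation
print(corr.shape)   # (3, 3): one variable per *column*, not per row
print(corr[0][1:])  # crawl column vs. each segment column: [1.  0.2]
                    # seg 0 ranks the four properties exactly as the crawl does;
                    # seg 1 only keeps the top property in place
```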
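
And since the script is meant to be run with `python3 -i`, the ranking helpers can be tried at the prompt. A small sketch, again with invented counts, assuming the file's definitions are in scope:

```python
# counts for 3 properties in one segment (invented)
seg_counts = np.array([120, 80, 95])

r = stats.rankdata(-seg_counts, method='average')
print(r)              # [1. 3. 2.]: negating the counts makes the biggest count rank 1

print(first_diff(r))  # 1: the ranking first departs from the baseline 1,2,3,...
                      # at position 1 (rank 3.0 where 2.0 was expected)
```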