Mercurial > hg > cc > cirrus_work
view bin/spearman.py @ 111:ab3d547f3e76
one uncommited fix from quentin
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 22 Sep 2023 15:27:28 +0100 |
parents | da1cbcd8acee |
children |
line wrap: on
line source
#!/usr/bin/env python3 '''Rank correlation processing for a csv tabulation of counts by segment First column is for whole crawl, then 100 columns for segs 0-99 Each row is counts for some property, e.g. mime-detected or tld For example, assuming all.tsv has the whole-crawl warc-only counts and s...tsv have the segment counts, all with counts in column 1, tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv will produce such a file with * 100 rows, one for each of the top 100 counts * 101 columns, 0 for all and 1--100 for segs 0--99 Usage: python3 -i spearman.py name id where name.csv has the input ''' import numpy as np from numpy import loadtxt from scipy import stats import statsmodels.api as sm import matplotlib.pyplot as plt import pylab import sys, math def qqa(): # q-q plot for the whole crawl sm.qqplot(all, line='s') plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id) plt.show() def qqs(): # q-q plots for the best and worst (by variance) segments global xv, xworst, xbest xv=[d.variance for d in xd] xworst=xv.index(max(xv)) xbest=xv.index(min(xv)) print(xbest,xworst) sm.qqplot(x[xbest], line='s') plt.gca().set_title('Best segment (least variance): %s'%xbest) plt.show() sm.qqplot(x[xworst], line='s') plt.gca().set_title('Worst segment (most variance): %s'%xworst) plt.show() def plot_x(sort=False,block=True,all_only=True,title=None): # Make these two subplots, w. and w/o sorting # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot # for legend hacking if sort: aso=np.argsort(-all) plot_all=all[aso] plot_x=np.array([xd[i].mean for i in range(N)])[aso] else: plot_all=all plot_x=[xd[i].mean for i in range(N)] if title is None: l1='Rank correlation of segment x whole crawl' l2='Mean of segment x whole crawl' plt.legend(loc='best',fontsize='small') else: l1=l2=None plt.plot(plot_all,'rx',label=l1) plt.plot([0,N-1],[all_m,all_m],'r',label=l2) if not(all_only): plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') plt.axis([0,N-1,0.85 if all_only else 0.8,1.0]) plt.grid(True) if title is not None: plt.title(title) plt.show(block=block) def hist_x(align='mid'): hist(xm,xsd,[xd[i].mean for i in range(N)], 'Mean of rank correlation of each segment x all other segments', align) def hist_all(align='mid'): hist(all_m,np.sqrt(all_s.variance),all, 'Rank correlation of each segment x whole crawl %s'%id, align) def hist(m,sd,hh,title,align): sdd=[(i,m-(i*sd)) for i in range(-2,3)] fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 sdax=hax.twiny() hax.hist(hh,color='lightblue',align=align) hax.set_title(title) for s,v in sdd: sdax.plot([v,v],[0,18],'b') sdax.set_xlim(hax.get_xlim()) sdax.set_ylim(hax.get_ylim()) sdax.set_xticks([v for s,v in sdd]) sdax.set_xticklabels([str(s) for s,v in sdd]) plt.show() def ci(rho,n,conf=0.95): # Courtesy of https://stats.stackexchange.com/a/18904 # rho is (rank) correlation, n is sample size stderr=1.0/math.sqrt(n-3) z=stats.norm.ppf(1.0-((1.0-conf)/2)) delta=z*stderr lower=math.tanh(math.atanh(rho)-delta) upper=math.tanh(math.atanh(rho)+delta) return (lower,upper) def plot_ci(rhos,n,trim=None,conf=0.95): # rhos are (rank) correlation values rhos_s=rhos[(-rhos).argsort()] if trim is None: l=len(rhos) else: rhos_s=rhos_s[:trim] l=trim cc=(np.array([ci(r,n,conf) for r in rhos_s])).T ue=cc[1]-rhos_s le=rhos_s-cc[0] #for i in range(len(rhos)): #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100)) plt.show() def first_diff(ranks): # first disagreement with baseline == {1,2,...} for i in range(len(ranks)): if ranks[i]!=i+1.0: return i return i+1 def ranks(): # Combine segment measures: # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement # convert to ranks, smallest value == highest rank all_ranked=stats.rankdata(-all,method='average') # invert since # large corr is good x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) # small corr variance is good x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)]) # invert since # large mean corr is good fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)]) # invert since # large first diff is good return np.array([[i, all_ranked[i], x_variance_ranked[i], x_mean_ranked[i], fd_ranked[i]] for i in range(N)]) def main(): global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr global aa, aa_by_all, N counts=loadtxt(sys.argv[1]+".csv",delimiter=',') id=sys.argv[2] N=counts.shape[1]-1 # "If axis=0 (default), then each column represents a variable, with # observations in the rows" # So each column is a sequence of counts, for whole crawl in column 0 # and for segments 0--N-1 in columns 1--N corr=stats.spearmanr(counts,nan_policy='omit').correlation all=corr[0][1:] all_s=stats.describe(all) all_m=all_s.mean x=np.array([np.concatenate((corr[i][1:i], corr[i][i+1:])) for i in range(1,N+1)]) # The above, although transposed, works because the correlation matrix # is symmetric xd=[stats.describe(x[i]) for i in range(N)] xs=stats.describe(np.array([xd[i].mean for i in range(N)])) xm=xs.mean xsd=np.sqrt(xs.variance) x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] aa=ranks() aa_by_all=aa[aa[:,1].argsort()] ### I need to review rows, e.g. counts[0] is an array of N+1 counts ### for the most common label in the complete crawl, ### from the complete crawl and all the segments ### versus columns, e.g. counts[:,0] is an array of N decreasing counts ### for all the labels in the complete crawl