changeset 177:a5d54736a77f
renamed from spearman.py
author   | Henry S. Thompson <ht@inf.ed.ac.uk>
date     | Fri, 24 Nov 2023 20:39:08 +0000
parents  | 08e903ed5a98
children | c42a5f7c97c5
files    | lib/python/cc/stats.py
diffstat | 1 files changed, 192 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/stats.py	Fri Nov 24 20:39:08 2023 +0000
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment
+   First column is for the whole crawl, then 100 columns for segs 0-99
+   Each row is counts for some property, e.g. mime-detected or tld
+
+   For example, assuming ALL.tsv has the whole-crawl warc-only counts
+   and s...tsv have the segment counts, ALL with counts in column 1,
+
+   tr -d ',' <ALL.tsv | head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+
+   will produce such a file with
+    * 100 rows, one for each of the top 100 counts
+    * 101 columns, 0 for ALL and 1--100 for segs 0--99
+
+   Usage: python3 -i stats.py name id
+    where name.csv has the input
+'''
+
+import numpy as np
+from numpy import loadtxt
+from scipy import stats
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+import pylab
+
+import sys, math
+
+def qqa():
+    # q-q plot for the whole crawl
+    sm.qqplot(ALL, line='s')
+    plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
+    plt.show()
+
+def qqs():
+    # q-q plots for the best and worst (by variance) segments
+    global xv, xworst, xbest
+    xv=[d.variance for d in xd]
+    xworst=xv.index(max(xv))
+    xbest=xv.index(min(xv))
+    print(xbest,xworst)
+    sm.qqplot(x[xbest], line='s')
+    plt.gca().set_title('Best segment (least variance): %s'%xbest)
+    plt.show()
+    sm.qqplot(x[xworst], line='s')
+    plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+    plt.show()
+
+def plot_x(sort=False,block=True,all_only=True,title=None):
+    # Make these two subplots, w. and w/o sorting
+    # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+    #  for legend hacking
+    if sort:
+        aso=np.argsort(-ALL)
+        plot_all=ALL[aso]
+        plot_x=np.array([xd[i].mean for i in range(N)])[aso]
+    else:
+        plot_all=ALL
+        plot_x=[xd[i].mean for i in range(N)]
+    if title is None:
+        l1='Rank correlation of segment x whole crawl'
+        l2='Mean of segment x whole crawl'
+    else:
+        l1=l2=None
+    plt.plot(plot_all,'rx',label=l1)
+    plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
+    if not(all_only):
+        plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x ALL other segments')
+        plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
+    if title is None:
+        # the legend has to be drawn after the labelled plots exist
+        plt.legend(loc='best',fontsize='small')
+    plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
+    plt.grid(True)
+    if title is not None:
+        plt.title(title)
+    plt.show(block=block)
+
+def hist_x(align='mid'):
+    hist(xm,xsd,[xd[i].mean for i in range(N)],
+         'Mean of rank correlation of each segment x ALL other segments',
+         align)
+
+def hist_all(align='mid'):
+    hist(all_m,np.sqrt(all_s.variance),ALL,
+         'Rank correlation of each segment x whole crawl %s'%id,
+         align)
+
+def hist(m,sd,hh,title,align):
+    # ticks on the top axis at m+k*sd for k in -2..2
+    sdd=[(k,m+(k*sd)) for k in range(-2,3)]
+    fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+    sdax=hax.twiny()
+    hax.hist(hh,color='lightblue',align=align)
+    hax.set_title(title)
+    for s,v in sdd:
+        sdax.plot([v,v],[0,18],'b')
+    sdax.set_xlim(hax.get_xlim())
+    sdax.set_ylim(hax.get_ylim())
+    sdax.set_xticks([v for s,v in sdd])
+    sdax.set_xticklabels([str(s) for s,v in sdd])
+    plt.show()
+
+def ci(rho,n,conf=0.95):
+    # Courtesy of https://stats.stackexchange.com/a/18904
+    # rho is (rank) correlation, n is sample size
+    stderr=1.0/math.sqrt(n-3)
+    z=stats.norm.ppf(1.0-((1.0-conf)/2))
+    delta=z*stderr
+    lower=math.tanh(math.atanh(rho)-delta)
+    upper=math.tanh(math.atanh(rho)+delta)
+    return (lower,upper)
+
+def plot_ci(rhos,n,trim=None,conf=0.95):
+    # rhos are (rank) correlation values
+    rhos_s=rhos[(-rhos).argsort()]
+    if trim is None:
+        l=len(rhos)
+    else:
+        rhos_s=rhos_s[:trim]
+        l=trim
+    cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
+    ue=cc[1]-rhos_s
+    le=rhos_s-cc[0]
+    #for i in range(len(rhos)):
+    #    print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+    plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+    plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
+    plt.show()
+
+def first_diff(ranks):
+    # first disagreement with baseline == {1,2,...}
+    for i in range(len(ranks)):
+        if ranks[i]!=i+1.0:
+            return i
+    return i+1
+
+def ranks():
+    # Combine segment measures:
+    #  segID, rank corr. wrt ALL, inverse variance, mean cross rank corr., first disagreement
+    # Convert to ranks, smallest value == highest rank
+    all_ranked=stats.rankdata(-ALL,method='average') # invert since large corr is good
+    x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) # small corr variance is good
+    x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)]) # invert since large mean corr is good
+    fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)]) # invert since large first diff is good
+    return np.array([[i,
+                      all_ranked[i],
+                      x_variance_ranked[i],
+                      x_mean_ranked[i],
+                      fd_ranked[i]] for i in range(N)])
+
+def main(file,ident):
+    # this should be a class with all these globals as instance vars
+    global counts, corr, ALL, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
+    global aa, aa_by_all, N, id
+    id=ident # publish the crawl id used in the plot titles above
+    counts=loadtxt(file+".csv",delimiter=',')
+    N=counts.shape[1]-1
+    # "If axis=0 (default), then each column represents a variable, with
+    #  observations in the rows"
+    # So each column is a sequence of counts, for the whole crawl in column 0
+    #  and for segments 0--N-1 in columns 1--N
+    corr=stats.spearmanr(counts,nan_policy='omit').correlation
+
+    ALL=corr[0][1:]
+    all_s=stats.describe(ALL)
+    all_m=all_s.mean
+
+    x=np.array([np.concatenate((corr[i][1:i],
+                                corr[i][i+1:])) for i in range(1,N+1)])
+    # The above, although transposed, works because the correlation matrix
+    #  is symmetric
+    xd=[stats.describe(x[i]) for i in range(N)]
+    xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
+    xm=xs.mean
+    xsd=np.sqrt(xs.variance)
+
+    x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
+
+    aa=ranks()
+    aa_by_all=aa[aa[:,1].argsort()]
+
+if __name__ == '__main__':
+    sys.exit(main(*sys.argv[1:]))
+
+
+### I need to review rows, e.g. counts[0] is an array of N+1 counts
+###  for the most common label in the complete crawl,
+###  from the complete crawl and ALL the segments,
+### versus columns, e.g. counts[:,0] is an array of decreasing counts
+###  for all the labels in the complete crawl (one per row, 100 here)
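For orientation, the script is meant to be run as `python3 -i stats.py name id` and then driven from the interactive prompt, e.g. `plot_x(sort=True)`, `hist_all()`, or `plot_ci(ALL, counts.shape[0])` (each correlation here is computed over the 100 label rows, so n is counts.shape[0]). As a quick sanity check of the Fisher-z interval that ci() computes, a minimal standalone sketch; the rho and n values below are made up for illustration:

    import math
    from scipy import stats

    def fisher_ci(rho, n, conf=0.95):
        # Same construction as ci() above: z-transform rho,
        # step +/- z*stderr, then transform back with tanh
        stderr = 1.0 / math.sqrt(n - 3)
        z = stats.norm.ppf(1.0 - (1.0 - conf) / 2)
        delta = z * stderr
        return (math.tanh(math.atanh(rho) - delta),
                math.tanh(math.atanh(rho) + delta))

    # With rho=0.9 and n=100: stderr = 1/sqrt(97) ~ 0.1015, z ~ 1.960,
    # delta ~ 0.199, giving roughly (0.855, 0.932) -- asymmetric about
    # 0.9, as expected after the tanh back-transform
    print(fisher_ci(0.9, 100))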