changeset 176:08e903ed5a98
renamed to stats.py
author | Henry S. Thompson <ht@inf.ed.ac.uk>
---|---
date | Fri, 24 Nov 2023 20:38:39 +0000
parents | 684370d1bf22
children | a5d54736a77f
files | lib/python/cc/spearman.py
diffstat | 1 files changed, 0 insertions(+), 192 deletions(-)
```diff
--- a/lib/python/cc/spearman.py	Fri Nov 24 19:52:52 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,192 +0,0 @@
-#!/usr/bin/env python3
-'''Rank correlation processing for a csv tabulation of counts by segment
-   First column is for whole crawl, then 100 columns for segs 0-99
-   Each row is counts for some property, e.g. mime-detected or tld
-
-   For example, assuming ALL.tsv has the whole-crawl warc-only counts
-   and s...tsv have the segment counts, ALL with counts in column 1,
-
-     tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
-
-   will produce such a file with
-    * 100 rows, one for each of the top 100 counts
-    * 101 columns, 0 for ALL and 1--100 for segs 0--99
-
-   Usage: python3 -i spearman.py name id
-    where name.csv has the input
-'''
-
-import numpy as np
-from numpy import loadtxt
-from scipy import stats
-import statsmodels.api as sm
-import matplotlib.pyplot as plt
-import pylab
-
-import sys, math
-
-def qqa():
-  # q-q plot for the whole crawl
-  sm.qqplot(ALL, line='s')
-  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
-  plt.show()
-
-def qqs():
-  # q-q plots for the best and worst (by variance) segments
-  global xv, xworst, xbest
-  xv=[d.variance for d in xd]
-  xworst=xv.index(max(xv))
-  xbest=xv.index(min(xv))
-  print(xbest,xworst)
-  sm.qqplot(x[xbest], line='s')
-  plt.gca().set_title('Best segment (least variance): %s'%xbest)
-  plt.show()
-  sm.qqplot(x[xworst], line='s')
-  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
-  plt.show()
-
-def plot_x(sort=False,block=True,all_only=True,title=None):
-  # Make these two subplots, w. and w/o sorting
-  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
-  #  for legend hacking
-  if sort:
-    aso=np.argsort(-ALL)
-    plot_all=ALL[aso]
-    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
-  else:
-    plot_all=ALL
-    plot_x=[xd[i].mean for i in range(N)]
-  if title is None:
-    l1='Rank correlation of segment x whole crawl'
-    l2='Mean of segment x whole crawl'
-    plt.legend(loc='best',fontsize='small')
-  else:
-    l1=l2=None
-  plt.plot(plot_all,'rx',label=l1)
-  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
-  if not(all_only):
-    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x ALL other segments')
-    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
-  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
-  plt.grid(True)
-  if title is not None:
-    plt.title(title)
-  plt.show(block=block)
-
-def hist_x(align='mid'):
-  hist(xm,xsd,[xd[i].mean for i in range(N)],
-       'Mean of rank correlation of each segment x ALL other segments',
-       align)
-
-def hist_all(align='mid'):
-  hist(all_m,np.sqrt(all_s.variance),ALL,
-       'Rank correlation of each segment x whole crawl %s'%id,
-       align)
-
-def hist(m,sd,hh,title,align):
-  sdd=[(i,m-(i*sd)) for i in range(-2,3)]
-  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
-  sdax=hax.twiny()
-  hax.hist(hh,color='lightblue',align=align)
-  hax.set_title(title)
-  for s,v in sdd:
-    sdax.plot([v,v],[0,18],'b')
-  sdax.set_xlim(hax.get_xlim())
-  sdax.set_ylim(hax.get_ylim())
-  sdax.set_xticks([v for s,v in sdd])
-  sdax.set_xticklabels([str(s) for s,v in sdd])
-  plt.show()
-
-def ci(rho,n,conf=0.95):
-  # Courtesy of https://stats.stackexchange.com/a/18904
-  # rho is (rank) correlation, n is sample size
-  stderr=1.0/math.sqrt(n-3)
-  z=stats.norm.ppf(1.0-((1.0-conf)/2))
-  delta=z*stderr
-  lower=math.tanh(math.atanh(rho)-delta)
-  upper=math.tanh(math.atanh(rho)+delta)
-  return (lower,upper)
-
-def plot_ci(rhos,n,trim=None,conf=0.95):
-  # rhos are (rank) correlation values
-  rhos_s=rhos[(-rhos).argsort()]
-  if trim is None:
-    l=len(rhos)
-  else:
-    rhos_s=rhos_s[:trim]
-    l=trim
-  cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
-  ue=cc[1]-rhos_s
-  le=rhos_s-cc[0]
-  #for i in range(len(rhos)):
-    #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
-  plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-  plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
-  plt.show()
-
-def first_diff(ranks):
-  # first disagreement with baseline == {1,2,...}
-  for i in range(len(ranks)):
-    if ranks[i]!=i+1.0:
-      return i
-  return i+1
-
-def ranks():
-  # Combine segment measures:
-  #  segID,rank corr. wrt ALL,inverse variance, mean cross rank corr.,first disagreement
-  # convert to ranks, smallest value == highest rank
-  all_ranked=stats.rankdata(-ALL,method='average') # invert since
-                                                   #  large corr is good
-  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
-                                                   # small corr variance is good
-  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
-                                                   # invert since
-                                                   #  large mean corr is good
-  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
-                                                   # invert since
-                                                   #  large first diff is good
-  return np.array([[i,
-                    all_ranked[i],
-                    x_variance_ranked[i],
-                    x_mean_ranked[i],
-                    fd_ranked[i]] for i in range(N)])
-
-def main(file,id):
-  # this should be a class with all these globals as instance vars
-  global counts, corr, ALL, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
-  global aa, aa_by_all, N
-  counts=loadtxt(file+".csv",delimiter=',')
-  N=counts.shape[1]-1
-  # "If axis=0 (default), then each column represents a variable, with
-  #  observations in the rows"
-  # So each column is a sequence of counts, for whole crawl in column 0
-  #  and for segments 0--N-1 in columns 1--N
-  corr=stats.spearmanr(counts,nan_policy='omit').correlation
-
-  ALL=corr[0][1:]
-  all_s=stats.describe(ALL)
-  all_m=all_s.mean
-
-  x=np.array([np.concatenate((corr[i][1:i],
-                              corr[i][i+1:])) for i in range(1,N+1)])
-  # The above, although transposed, works because the correlation matrix
-  #  is symmetric
-  xd=[stats.describe(x[i]) for i in range(N)]
-  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
-  xm=xs.mean
-  xsd=np.sqrt(xs.variance)
-
-  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
-
-  aa=ranks()
-  aa_by_all=aa[aa[:,1].argsort()]
-
-if __name__ == '__main__':
-  sys.exit(main(*sys.argv[1:]))
-
-
-### I need to review rows, e.g. counts[0] is an array of N+1 counts
-###  for the most common label in the complete crawl,
-###  from the complete crawl and ALL the segments
-### versus columns, e.g. counts[:,0] is an array of N decreasing counts
-###  for all the labels in the complete crawl
```
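The `ci` function in the deleted file is a standard Fisher z-transformation interval: z = atanh(rho) is approximately normal with standard error 1/sqrt(n-3), so the interval is built in z-space and mapped back with tanh. A minimal standalone sketch of that same computation; the sample inputs (rho=0.97, n=100, to match the top-100-counts setup the docstring describes) are hypothetical:

```python
# Standalone sketch of the Fisher z-transform interval used by ci() above.
# The example values rho=0.97, n=100 are hypothetical.
import math
from scipy import stats

def fisher_ci(rho, n, conf=0.95):
    stderr = 1.0 / math.sqrt(n - 3)              # SE of atanh(rho)
    z = stats.norm.ppf(1.0 - (1.0 - conf) / 2)   # two-sided normal critical value
    delta = z * stderr
    # Build the interval in z-space, then map back to correlation space
    return (math.tanh(math.atanh(rho) - delta),
            math.tanh(math.atanh(rho) + delta))

print(fisher_ci(0.97, 100))  # roughly (0.956, 0.980)
```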
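The closing `###` comments distinguish rows (one label's counts across the whole crawl plus every segment) from columns (all labels' counts for one source). A toy demonstration of that layout and of the `spearmanr` axis convention `main()` relies on; the matrix here is made up (3 labels as rows; whole-crawl counts in column 0, two segments in columns 1-2):

```python
# Toy demonstration of the data layout main() assumes; the numbers are invented.
import numpy as np
from scipy import stats

counts = np.array([[900., 450., 430.],   # most common label overall
                   [300., 160.,  60.],
                   [100.,  40., 140.]])  # note segment 2 ranks labels differently

print(counts[0])     # row: one label's counts across crawl + all segments
print(counts[:, 0])  # column: all labels' counts within the whole crawl

# With a 2-D input, spearmanr treats each *column* as a variable, so row 0
# of the correlation matrix holds each segment's rank correlation with the
# whole crawl -- exactly what main() extracts as ALL=corr[0][1:].
corr = stats.spearmanr(counts).correlation
print(corr[0][1:])   # [1.0, 0.5]: segment 1 agrees fully, segment 2 does not
```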