Mercurial > hg > cc > cirrus_work
changeset 179:16d603447fbc
move to class with local vars instead of many globals
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Nov 2023 20:41:03 +0000 |
parents | c42a5f7c97c5 |
children | a20cf98f3a77 |
files | lib/python/cc/stats.py |
diffstat | 1 files changed, 155 insertions(+), 156 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
'''Rank correlation processing for a csv tabulation of counts by segment

   First column is for whole crawl, then 100 columns for segs 0-99
   Each row is counts for some property, e.g. mime-detected or tld

   For example, assuming ALL.tsv has the whole-crawl warc-only counts
   and s...tsv have the segment counts, ALL with counts in column 1,

     tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv

   will produce such a file with
     * 100 rows, one for each of the top 100 counts
     * 101 columns, 0 for ALL and 1--100 for segs 0--99

   Usage: python3 -i spearman.py name id
'''

import sys, math

# NOTE(review): the full file imports numpy (np, loadtxt), scipy.stats (stats),
# statsmodels.api (sm) and matplotlib.pyplot (plt) above this hunk — those
# lines are outside the visible diff context; confirm against the repository.

class CCStats:
    '''Rank-correlation statistics for per-segment counts of a crawl.

       Loads <file>.csv (rows = properties, column 0 = whole crawl,
       columns 1..N = segments) and precomputes the Spearman rank
       correlations and summary statistics used by the plotting methods.
    '''

    def __init__(self, file, id):
        '''file: basename of the .csv count tabulation; id: crawl label
           used in plot titles.'''
        self.file = file
        self.id = id
        self.counts = loadtxt(file + ".csv", delimiter=',')
        self.N = self.counts.shape[1] - 1
        # "If axis=0 (default), then each column represents a variable, with
        #  observations in the rows"
        # So each column is a sequence of counts, for whole crawl in column 0
        # and for segments 0--N-1 in columns 1--N
        # Bug fix vs. changeset: was `.self.correlation`, an invalid
        # attribute chain produced by an over-eager rename.
        self.corr = stats.spearmanr(self.counts, nan_policy='omit').correlation

        # Correlation of each segment against the whole crawl (row 0,
        # excluding the trivial self-correlation in column 0).
        self.all = self.corr[0][1:]
        self.all_s = stats.describe(self.all)
        self.all_m = self.all_s.mean

        # Per-segment cross-correlations: for segment i, all other segments'
        # correlations (drop column 0 = whole crawl and the diagonal entry).
        self.x = np.array([np.concatenate((self.corr[i][1:i],
                                           self.corr[i][i + 1:]))
                           for i in range(1, self.N + 1)])
        # The above, although transposed, works because the correlation matrix
        # is symmetric
        self.xd = [stats.describe(self.x[i]) for i in range(self.N)]
        self.xs = stats.describe(np.array([self.xd[i].mean
                                           for i in range(self.N)]))
        self.xm = self.xs.mean
        self.xsd = np.sqrt(self.xs.variance)

        # Rank (1 = largest count) of every property within each segment.
        self.x_ranks = [stats.rankdata(-self.counts[:, i], method='average')
                        for i in range(1, self.N + 1)]

        # Bug fix vs. changeset: `ranks` is now a method, so call via self.
        self.aa = self.ranks()
        self.aa_by_all = self.aa[self.aa[:, 1].argsort()]

    def qqa(self):
        # q-q plot for the whole crawl
        sm.qqplot(self.all, line='s')
        # Bug fix vs. changeset: title text was mangled to
        # 'Rank self.correlation ...' and used the builtin `id`.
        plt.gca().set_title(
            'Rank correlation per segment wrt whole archive %s' % self.id)
        plt.show()

    def qqs(self):
        # q-q plots for the best and worst (by variance) segments
        # (pre-class version kept xv/xworst/xbest as globals; they are
        #  plain locals now)
        xv = [d.variance for d in self.xd]
        xworst = xv.index(max(xv))
        xbest = xv.index(min(xv))
        print(xbest, xworst)
        sm.qqplot(self.x[xbest], line='s')
        plt.gca().set_title('Best segment (least variance): %s' % xbest)
        plt.show()
        sm.qqplot(self.x[xworst], line='s')
        plt.gca().set_title('Worst segment (most variance): %s' % xworst)
        plt.show()

    def plot_x(self, sort=False, block=True, all_only=True, title=None):
        # Make these two subplots, w. and w/o sorting
        # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
        # for legend hacking
        # Bug fix vs. changeset: `self` parameter was missing and the label
        # strings had been mangled ('segment self.x whole crawl').
        if sort:
            aso = np.argsort(-self.all)
            plot_all = self.all[aso]
            plot_x = np.array([self.xd[i].mean for i in range(self.N)])[aso]
        else:
            plot_all = self.all
            plot_x = [self.xd[i].mean for i in range(self.N)]
        if title is None:
            l1 = 'Rank correlation of segment x whole crawl'
            l2 = 'Mean of segment x whole crawl'
            # NOTE(review): legend() is invoked before any artists are
            # plotted, as in the pre-class version — confirm intent.
            plt.legend(loc='best', fontsize='small')
        else:
            l1 = l2 = None
        plt.plot(plot_all, 'rx', label=l1)
        plt.plot([0, self.N - 1], [self.all_m, self.all_m], 'r', label=l2)
        if not all_only:
            plt.plot(plot_x, 'bx',
                     label='Mean of rank correlation of each segment x ALL other segments')
            plt.plot([0, self.N - 1], [self.xm, self.xm], 'b',
                     label='Mean of segment x segment means')
        plt.axis([0, self.N - 1, 0.85 if all_only else 0.8, 1.0])
        plt.grid(True)
        if title is not None:
            plt.title(title)
        plt.show(block=block)

    def hist_x(self, align='mid'):
        # Histogram of per-segment mean cross-correlations.
        # Bug fix vs. changeset: missing `self`; `hist` called unqualified.
        self.hist(self.xm, self.xsd,
                  [self.xd[i].mean for i in range(self.N)],
                  'Mean of rank correlation of each segment x ALL other segments',
                  align)

    def hist_all(self, align='mid'):
        # Histogram of segment-vs-whole-crawl correlations.
        self.hist(self.all_m, np.sqrt(self.all_s.variance), self.all,
                  'Rank correlation of each segment x whole crawl %s' % self.id,
                  align)

    def hist(self, m, sd, hh, title, align):
        '''Histogram of hh with a secondary axis marking m +/- k*sd
           for k in -2..2.'''
        sdd = [(i, m - (i * sd)) for i in range(-2, 3)]
        fig, hax = plt.subplots()  # Thanks to https://stackoverflow.com/a/7769497
        sdax = hax.twiny()
        hax.hist(hh, color='lightblue', align=align)
        hax.set_title(title)
        for s, v in sdd:
            sdax.plot([v, v], [0, 18], 'b')
        sdax.set_xlim(hax.get_xlim())
        sdax.set_ylim(hax.get_ylim())
        sdax.set_xticks([v for s, v in sdd])
        sdax.set_xticklabels([str(s) for s, v in sdd])
        plt.show()

    def ci(self, rho, n, conf=0.95):
        '''Confidence interval for a (rank) correlation rho at sample
           size n, via the Fisher z-transform.'''
        # Courtesy of https://stats.stackexchange.com/a/18904
        stderr = 1.0 / math.sqrt(n - 3)
        z = stats.norm.ppf(1.0 - ((1.0 - conf) / 2))
        delta = z * stderr
        lower = math.tanh(math.atanh(rho) - delta)
        upper = math.tanh(math.atanh(rho) + delta)
        return (lower, upper)

    def plot_ci(self, rhos, n, trim=None, conf=0.95):
        '''Error-bar plot of sorted correlation values with their
           confidence intervals; trim limits to the largest `trim` values.'''
        rhos_s = rhos[(-rhos).argsort()]
        if trim is None:
            l = len(rhos)
        else:
            rhos_s = rhos_s[:trim]
            l = trim
        # Bug fix vs. changeset: `ci` called unqualified; title string mangled
        # and used the builtin `id`.
        cc = (np.array([self.ci(r, n, conf) for r in rhos_s])).T
        ue = cc[1] - rhos_s
        le = rhos_s - cc[0]
        plt.errorbar(np.arange(l), rhos_s, yerr=[le, ue], fmt='o')
        plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"
                  % (self.id, conf * 100))
        plt.show()

    def first_diff(self, ranks):
        '''Index of the first disagreement of `ranks` with the baseline
           ranking {1,2,...}; len(ranks) if they agree throughout.'''
        for i in range(len(ranks)):
            if ranks[i] != i + 1.0:
                return i
        return i + 1

    def ranks(self):
        # Combine segment measures:
        #  segID, rank corr. wrt ALL, inverse variance,
        #  mean cross rank corr., first disagreement
        # convert to ranks, smallest value == highest rank
        all_ranked = stats.rankdata(-self.all, method='average')  # invert since
                                                                  # large corr is good
        x_variance_ranked = stats.rankdata([self.xd[i].variance
                                            for i in range(self.N)])
        # small corr variance is good
        x_mean_ranked = stats.rankdata([-(self.xd[i].mean)
                                        for i in range(self.N)])
        # invert since large mean corr is good
        # Bug fix vs. changeset: `first_diff` called unqualified.
        fd_ranked = stats.rankdata([-self.first_diff(self.x_ranks[i])
                                    for i in range(self.N)])
        # invert since large first diff is good
        return np.array([[i,
                          all_ranked[i],
                          x_variance_ranked[i],
                          x_mean_ranked[i],
                          fd_ranked[i]] for i in range(self.N)])

if __name__ == '__main__':
    S = CCStats(*sys.argv[1:])

### I need to review rows, e.g. counts[0] is an array of N+1 counts
### for the most common label in the complete crawl,
### from the complete crawl and ALL the segments
### versus columns, e.g. counts[:,0] is an array of N decreasing counts
### for all the labels in the complete crawl