changeset 180:a20cf98f3a77
rename to avoid name clash with scipy.stats
author      Henry S. Thompson <ht@inf.ed.ac.uk>
date        Sun, 26 Nov 2023 21:24:38 +0000
parents     16d603447fbc
children    0058e523f649
files       lib/python/cc/ccstats.py lib/python/cc/stats.py
diffstat    2 files changed, 192 insertions(+), 191 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/ccstats.py	Sun Nov 26 21:24:38 2023 +0000
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment
+   First column is for the whole crawl, then 100 columns for segs 0-99
+   Each row is counts for some property, e.g. mime-detected or tld
+
+   For example, assuming ALL.tsv has the whole-crawl warc-only counts
+   and s...tsv have the segment counts, ALL with counts in column 1,
+
+   tr -d ',' <ALL.tsv | head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+
+   will produce such a file with
+    * 100 rows, one for each of the top 100 counts
+    * 101 columns, 0 for ALL and 1--100 for segs 0--99
+
+   Usage: python3 -i ccstats.py name id
+     where name.csv has the input
+'''
+
+import numpy as np
+from numpy import loadtxt
+from scipy import stats
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+
+import sys, math
+
+class CCStats:
+    def __init__(self, file, id):
+        self.file = file
+        self.id = id
+        self.counts = loadtxt(file+".csv", delimiter=',')
+        self.N = self.counts.shape[1]-1
+        # "If axis=0 (default), then each column represents a variable,
+        #  with observations in the rows"
+        # So each column is a sequence of counts: the whole crawl in
+        #  column 0, segments 0--N-1 in columns 1--N
+        self.corr = stats.spearmanr(self.counts, nan_policy='omit').correlation
+
+        self.all = self.corr[0][1:]
+        self.all_s = stats.describe(self.all)
+        self.all_m = self.all_s.mean
+
+        self.x = np.array([np.concatenate((self.corr[i][1:i],
+                                           self.corr[i][i+1:])) for i in range(1, self.N+1)])
+        # The above, although transposed, works because the correlation
+        #  matrix is symmetric
+        self.xd = [stats.describe(self.x[i]) for i in range(self.N)]
+        self.xs = stats.describe(np.array([self.xd[i].mean for i in range(self.N)]))
+        self.xm = self.xs.mean
+        self.xsd = np.sqrt(self.xs.variance)
+
+        self.x_ranks = [stats.rankdata(-self.counts[:, i], method='average')
+                        for i in range(1, self.N+1)]
+
+        self.aa = self.ranks()
+        self.aa_by_all = self.aa[self.aa[:, 1].argsort()]
+        self.iranks_by_all = [int(r) for r in self.aa_by_all[:, 0]]
+
+    def qqa(self):
+        # q-q plot for the whole crawl
+        sm.qqplot(self.all, line='s')
+        plt.gca().set_title('Rank correlation per segment wrt whole archive %s' % self.id)
+        plt.show()
+
+    def qqs(self):
+        # q-q plots for the best and worst (by variance) segments
+        global xv, xworst, xbest  # global so they survive for interactive inspection
+        xv = [d.variance for d in self.xd]
+        xworst = xv.index(max(xv))
+        xbest = xv.index(min(xv))
+        print(xbest, xworst)
+        sm.qqplot(self.x[xbest], line='s')
+        plt.gca().set_title('Best segment (least variance): %s' % xbest)
+        plt.show()
+        sm.qqplot(self.x[xworst], line='s')
+        plt.gca().set_title('Worst segment (most variance): %s' % xworst)
+        plt.show()
+
+    def plot_x(self, sort=False, block=True, all_only=True, title=None):
+        # To do: make these two subplots, w. and w/o sorting
+        # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+        #  for legend hacking
+        if sort:
+            aso = np.argsort(-self.all)
+            plot_all = self.all[aso]
+            plot_x = np.array([self.xd[i].mean for i in range(self.N)])[aso]
+        else:
+            plot_all = self.all
+            plot_x = [self.xd[i].mean for i in range(self.N)]
+        if title is None:
+            l1 = 'Rank correlation of segment x whole crawl'
+            l2 = 'Mean of segment x whole crawl'
+        else:
+            l1 = l2 = None
+        plt.plot(plot_all, 'rx', label=l1)
+        plt.plot([0, self.N-1], [self.all_m, self.all_m], 'r', label=l2)
+        if not all_only:
+            plt.plot(plot_x, 'bx',
+                     label='Mean of rank correlation of each segment x all other segments')
+            plt.plot([0, self.N-1], [self.xm, self.xm], 'b',
+                     label='Mean of segment x segment means')
+        plt.axis([0, self.N-1, 0.85 if all_only else 0.8, 1.0])
+        plt.grid(True)
+        if title is None:
+            # legend must follow the labelled plot calls to pick the labels up
+            plt.legend(loc='best', fontsize='small')
+        else:
+            plt.title(title)
+        plt.show(block=block)
+
+    def hist_x(self, align='mid'):
+        self.hist(self.xm, self.xsd, [self.xd[i].mean for i in range(self.N)],
+                  'Mean of rank correlation of each segment x all other segments',
+                  align)
+
+    def hist_all(self, align='mid'):
+        self.hist(self.all_m, np.sqrt(self.all_s.variance), self.all,
+                  'Rank correlation of each segment x whole crawl %s' % self.id,
+                  align)
+
+    def plot_ci(self, rhos, n, trim=None, conf=0.95):
+        # rhos are (rank) correlation values, n the sample size behind each
+        rhos_s = rhos[(-rhos).argsort()]
+        if trim is None:
+            l = len(rhos)
+        else:
+            rhos_s = rhos_s[:trim]
+            l = trim
+        cc = (np.array([ci(r, n, conf) for r in rhos_s])).T
+        ue = cc[1]-rhos_s
+        le = rhos_s-cc[0]
+        #for i in range(len(rhos)):
+        #    print(cc[i][0], rhos_s[i]-cc[i][0], rhos_s[i], cc[i][1], -rhos_s[i]+cc[i][1])
+        plt.errorbar(np.arange(l), rhos_s, yerr=[le, ue], fmt='o')
+        plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%" % (self.id, conf*100))
+        plt.show()
+
+    def ranks(self):
+        # Combine four segment measures: rank corr. wrt the whole crawl,
+        #  corr. variance, mean cross-segment rank corr., first disagreement.
+        # Each is converted to ranks, smallest value == highest rank.
+        all_ranked = stats.rankdata(-self.all, method='average')
+        # inverted, since large corr is good
+        x_variance_ranked = stats.rankdata([self.xd[i].variance for i in range(self.N)])
+        # not inverted: small corr variance is good
+        x_mean_ranked = stats.rankdata([-(self.xd[i].mean) for i in range(self.N)])
+        # inverted, since large mean corr is good
+        fd_ranked = stats.rankdata([-first_diff(self.x_ranks[i]) for i in range(self.N)])
+        # inverted, since a late first disagreement is good
+        return np.array([[i,
+                          all_ranked[i],
+                          x_variance_ranked[i],
+                          x_mean_ranked[i],
+                          fd_ranked[i]] for i in range(self.N)])
+
+    def hist(self, m, sd, hh, title, align):
+        # Histogram of hh, with a second x-axis marking the mean
+        #  and one and two standard deviations either side
+        sdd = [(i, m-(i*sd)) for i in range(-2, 3)]
+        fig, hax = plt.subplots()  # Thanks to https://stackoverflow.com/a/7769497
+        sdax = hax.twiny()
+        hax.hist(hh, color='lightblue', align=align)
+        hax.set_title(title)
+        for s, v in sdd:
+            sdax.plot([v, v], [0, 18], 'b')
+        sdax.set_xlim(hax.get_xlim())
+        sdax.set_ylim(hax.get_ylim())
+        sdax.set_xticks([v for s, v in sdd])
+        sdax.set_xticklabels([str(s) for s, v in sdd])
+        plt.show()
+
+def first_diff(ranks):
+    # Index of the first disagreement with the baseline ranking {1,2,...}
+    for i in range(len(ranks)):
+        if ranks[i] != i+1.0:
+            return i
+    return i+1
+
+def ci(rho, n, conf=0.95):
+    # Confidence interval for a (rank) correlation rho at sample size n,
+    #  via the Fisher z-transform
+    # Courtesy of https://stats.stackexchange.com/a/18904
+    stderr = 1.0/math.sqrt(n-3)
+    z = stats.norm.ppf(1.0-((1.0-conf)/2))
+    delta = z*stderr
+    lower = math.tanh(math.atanh(rho)-delta)
+    upper = math.tanh(math.atanh(rho)+delta)
+    return (lower, upper)
+
+if __name__ == '__main__':
+    S = CCStats(*sys.argv[1:])
+
+### I need to review rows, e.g. counts[0] is an array of N+1 counts
+###  for the most common label in the complete crawl,
+###  from the complete crawl and all the segments,
+### versus columns, e.g. counts[:,0] is an array of N decreasing counts
+###  for all the labels in the complete crawl
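The module is meant to be driven interactively (hence the `-i` in the usage line): running it binds `S` to a `CCStats` instance, and the plotting methods are then called by hand. A minimal session sketch, assuming an `all_100.csv` built by the shell pipeline in the docstring; the crawl label `CC-MAIN-2023-40` is a made-up example that only appears in plot titles:

    $ python3 -i ccstats.py all_100 CC-MAIN-2023-40
    >>> S.qqa()                              # q-q plot of per-segment correlations
    >>> S.plot_x(sort=True)                  # sorted per-segment correlations, with mean line
    >>> S.plot_ci(S.all, S.counts.shape[0])  # same correlations with 95% confidence bars
    >>> S.iranks_by_all[:10]                 # the ten segments that track the whole crawl best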
--- a/lib/python/cc/stats.py	Fri Nov 24 20:41:03 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,191 +0,0 @@
-#!/usr/bin/env python3
-'''Rank self.correlation processing for a csv tabulation of self.counts by segment
-   First column is for whole crawl, then 100 columns for segs 0-99
-   Each row is self.counts for some property, e.g. mime-detected or tld
-
-   For example, assuming ALL.tsv has the whole-crawl warc-only self.counts
-   and s...tsv have the segment self.counts, ALL with self.counts in column 1,
-
-   tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
-
-   will produce such a file with
-    * 100 rows, one for each of the top 100 self.counts
-    * 101 columns, 0 for ALL and 1--100 for segs 0--99
-
-   Usage: python3 -i spearman.py name id
-     where name.csv has the input
-'''
-
-import numpy as np
-from numpy import loadtxt
-from scipy import stats
-import statsmodels.api as sm
-import matplotlib.pyplot as plt
-import pylab
-
-import sys, math
-
-class CCStats:
-    def __init__(self, file, id):
-        self.file = file
-        self.id = id
-        self.counts=loadtxt(file+".csv",delimiter=',')
-        self.N=self.counts.shape[1]-1
-        # "If axis=0 (default), then each column represents a variable, with
-        # observations in the rows"
-        # So each column is a sequence of self.counts, for whole crawl in column 0
-        # and for segments 0--self.N-1 in columns 1--self.N
-        self.corr=stats.spearmanr(self.counts,nan_policy='omit').self.correlation
-
-        self.all=self.corr[0][1:]
-        self.all_s=stats.describe(self.all)
-        self.all_m=self.all_s.mean
-
-        self.x=np.array([np.concatenate((self.corr[i][1:i],
-                                         self.corr[i][i+1:])) for i in range(1,self.N+1)])
-        # The above, although transposed, works because the self.correlation matrix
-        # is symmetric
-        self.xd=[stats.describe(self.x[i]) for i in range(self.N)]
-        self.xs=stats.describe(np.array([self.xd[i].mean for i in range(self.N)]))
-        self.xm=self.xs.mean
-        self.xsd=np.sqrt(self.xs.variance)
-
-        self.x_ranks=[stats.rankdata(-self.counts[:,i],method='average') for i in range(1,self.N+1)]
-
-        self.aa=ranks()
-        self.aa_by_all=self.aa[self.aa[:,1].argsort()]
-
-    def qqa(self):
-        # q-q plot for the whole crawl
-        sm.qqplot(self.all, line='s')
-        plt.gca().set_title('Rank self.correlation per segment wrt whole archive %s'%id)
-        plt.show()
-
-    def qqs(self):
-        # q-q plots for the best and worst (by variance) segments
-        global xv, xworst, xbest
-        xv=[d.variance for d in self.xd]
-        xworst=xv.index(max(xv))
-        xbest=xv.index(min(xv))
-        print(xbest,xworst)
-        sm.qqplot(self.x[xbest], line='s')
-        plt.gca().set_title('Best segment (least variance): %s'%xbest)
-        plt.show()
-        sm.qqplot(self.x[xworst], line='s')
-        plt.gca().set_title('Worst segment (most variance): %s'%xworst)
-        plt.show()
-
-    def plot_x(sort=False,block=True,all_only=True,title=None):
-        # Make these two subplots, w. and w/o sorting
-        # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
-        # for legend hacking
-        if sort:
-            aso=np.argsort(-self.all)
-            plot_all=self.all[aso]
-            plot_x=np.array([self.xd[i].mean for i in range(self.N)])[aso]
-        else:
-            plot_all=self.all
-            plot_x=[self.xd[i].mean for i in range(self.N)]
-        if title is None:
-            l1='Rank self.correlation of segment self.x whole crawl'
-            l2='Mean of segment self.x whole crawl'
-            plt.legend(loc='best',fontsize='small')
-        else:
-            l1=l2=None
-        plt.plot(plot_all,'rx',label=l1)
-        plt.plot([0,self.N-1],[self.all_m,self.all_m],'r',label=l2)
-        if not(all_only):
-            plt.plot(plot_x,'bx',label='Mean of rank self.correlation of each segment self.x self.all other segments')
-            plt.plot([0,self.N-1],[self.xm,self.xm],'b',label='Mean of segment self.x segment means')
-        plt.axis([0,self.N-1,0.85 if all_only else 0.8,1.0])
-        plt.grid(True)
-        if title is not None:
-            plt.title(title)
-        plt.show(block=block)
-
-    def hist_x(align='mid'):
-        hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)],
-             'Mean of rank self.correlation of each segment self.x self.all other segments',
-             align)
-
-    def hist_all(align='mid'):
-        hist(self.all_m,np.sqrt(self.all_s.variance),self.all,
-             'Rank self.correlation of each segment self.x whole crawl %s'%id,
-             align)
-
-    def hist(m,sd,hh,title,align):
-        sdd=[(i,m-(i*sd)) for i in range(-2,3)]
-        fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
-        sdax=hax.twiny()
-        hax.hist(hh,color='lightblue',align=align)
-        hax.set_title(title)
-        for s,v in sdd:
-            sdax.plot([v,v],[0,18],'b')
-        sdax.set_xlim(hax.get_xlim())
-        sdax.set_ylim(hax.get_ylim())
-        sdax.set_xticks([v for s,v in sdd])
-        sdax.set_xticklabels([str(s) for s,v in sdd])
-        plt.show()
-
-    def ci(rho,n,conf=0.95):
-        # Courtesy of https://stats.stackexchange.com/a/18904
-        # rho is (rank) self.correlation, n is sample size
-        stderr=1.0/math.sqrt(n-3)
-        z=stats.norm.ppf(1.0-((1.0-conf)/2))
-        delta=z*stderr
-        lower=math.tanh(math.atanh(rho)-delta)
-        upper=math.tanh(math.atanh(rho)+delta)
-        return (lower,upper)
-
-    def plot_ci(rhos,n,trim=None,conf=0.95):
-        # rhos are (rank) self.correlation values
-        rhos_s=rhos[(-rhos).argsort()]
-        if trim is None:
-            l=len(rhos)
-        else:
-            rhos_s=rhos_s[:trim]
-            l=trim
-        cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
-        ue=cc[1]-rhos_s
-        le=rhos_s-cc[0]
-        #for i in range(len(rhos)):
-        #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
-        plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-        plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
-        plt.show()
-
-    def first_diff(ranks):
-        # first disagreement with baseline == {1,2,...}
-        for i in range(len(ranks)):
-            if ranks[i]!=i+1.0:
-                return i
-        return i+1
-
-    def ranks():
-        # Combine segment measures:
-        # segID,rank self.corr. wrt self.all,inverse variance, mean cross rank self.corr.,first disagreement
-        # convert to ranks, smallest value == highest rank
-        all_ranked=stats.rankdata(-self.all,method='average') # invert since
-                                                              # large self.corr is good
-        x_variance_ranked=stats.rankdata([self.xd[i].variance for i in range(self.N)])
-                                                              # small self.corr variance is good
-        x_mean_ranked=stats.rankdata([-(self.xd[i].mean) for i in range(self.N)])
-                                                              # invert since
-                                                              # large mean self.corr is good
-        fd_ranked=stats.rankdata([-first_diff(self.x_ranks[i]) for i in range(self.N)])
-                                                              # invert since
-                                                              # large first diff is good
-        return np.array([[i,
-                          all_ranked[i],
-                          x_variance_ranked[i],
-                          x_mean_ranked[i],
-                          fd_ranked[i]] for i in range(self.N)])
-
-if __name__ == '__main__':
-    S = CCStats(*sys.argv[1:])
-
-### I need to review rows, e.g. self.counts[0] is an array of self.N+1 self.counts
-### for the most common label in the complete crawl,
-### from the complete crawl and all the segments
-### versus columns, e.g. self.counts[:,0] is an array of self.N decreasing self.counts
-### for all the labels in the complete crawl
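For reference, the `ci` helper is the standard Fisher z-transform interval: atanh(rho) is approximately normal with standard error 1/sqrt(n-3), so the bounds are tanh(atanh(rho) +/- z*stderr), with z the two-sided normal critical value. A standalone sanity check with made-up numbers (rho = 0.95 over the 100 rows this script works with):

    # Standalone check of the Fisher z-transform interval used by ci()
    import math
    from scipy import stats

    rho, n, conf = 0.95, 100, 0.95            # rho is a made-up example value
    stderr = 1.0 / math.sqrt(n - 3)           # standard error of atanh(rho)
    z = stats.norm.ppf(1.0 - (1.0 - conf)/2)  # two-sided critical value, ~1.96
    lo = math.tanh(math.atanh(rho) - z*stderr)
    hi = math.tanh(math.atanh(rho) + z*stderr)
    print(lo, hi)                             # roughly 0.926 and 0.966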