Mercurial > hg > cc > cirrus_work
changeset 37:da1cbcd8acee
push actions in main fn
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 13 Dec 2022 14:16:42 +0000 |
parents | 6b3a9ac5ad7b |
children | e3c440666f1a |
files | bin/spearman.py |
diffstat | 1 files changed, 52 insertions(+), 34 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/spearman.py Tue Dec 13 14:16:22 2022 +0000 +++ b/bin/spearman.py Tue Dec 13 14:16:42 2022 +0000 @@ -12,7 +12,7 @@ * 100 rows, one for each of the top 100 counts * 101 columns, 0 for all and 1--100 for segs 0--99 - Usage: python3 -i spearman.py name + Usage: python3 -i spearman.py name id where name.csv has the input ''' @@ -28,7 +28,7 @@ def qqa(): # q-q plot for the whole crawl sm.qqplot(all, line='s') - plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)') + plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id) plt.show() def qqs(): @@ -45,7 +45,7 @@ plt.gca().set_title('Worst segment (most variance): %s'%xworst) plt.show() -def plot_x(sort=False,block=True): +def plot_x(sort=False,block=True,all_only=True,title=None): # Make these two subplots, w. and w/o sorting # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot # for legend hacking @@ -56,28 +56,38 @@ else: plot_all=all plot_x=[xd[i].mean for i in range(N)] - plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl') - plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl') - plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') - plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') - plt.axis([0,N-1,0.8,1.0]) - plt.legend(loc='best',fontsize='small') + if title is None: + l1='Rank correlation of segment x whole crawl' + l2='Mean of segment x whole crawl' + plt.legend(loc='best',fontsize='small') + else: + l1=l2=None + plt.plot(plot_all,'rx',label=l1) + plt.plot([0,N-1],[all_m,all_m],'r',label=l2) + if not(all_only): + plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') + plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') + plt.axis([0,N-1,0.85 if all_only else 0.8,1.0]) plt.grid(True) + if title is not None: + plt.title(title) plt.show(block=block) -def hist_x(): +def hist_x(align='mid'): hist(xm,xsd,[xd[i].mean for i in range(N)], - 'Mean of rank correlation of each segment x all other segments') + 'Mean of rank correlation of each segment x all other segments', + align) -def hist_all(): +def hist_all(align='mid'): hist(all_m,np.sqrt(all_s.variance),all, - 'Rank correlation of each segment x whole crawl') + 'Rank correlation of each segment x whole crawl %s'%id, + align) -def hist(m,sd,hh,title): +def hist(m,sd,hh,title,align): sdd=[(i,m-(i*sd)) for i in range(-2,3)] fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 sdax=hax.twiny() - hax.hist(hh,color='lightblue') + hax.hist(hh,color='lightblue',align=align) hax.set_title(title) for s,v in sdd: sdax.plot([v,v],[0,18],'b') @@ -111,6 +121,7 @@ #for i in range(len(rhos)): #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') + plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100)) plt.show() def first_diff(ranks): @@ -140,28 +151,35 @@ x_mean_ranked[i], fd_ranked[i]] for i in range(N)]) -counts=loadtxt(sys.argv[1]+".csv",delimiter=',') -N=counts.shape[1]-1 -# "If axis=0 (default), then each column represents a variable, with -# observations in the rows" -# So each column is a sequence of counts, for whole crawl in column 0 -# and for segments 0--N-1 in columns 1--N -corr=stats.spearmanr(counts,nan_policy='omit').correlation +def main(): + global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr + global aa, aa_by_all, N + counts=loadtxt(sys.argv[1]+".csv",delimiter=',') + id=sys.argv[2] + N=counts.shape[1]-1 + # "If axis=0 (default), then each column represents a variable, with + # observations in the rows" + # So each column is a sequence of counts, for whole crawl in column 0 + # and for segments 0--N-1 in columns 1--N + corr=stats.spearmanr(counts,nan_policy='omit').correlation -all=corr[0][1:] -all_s=stats.describe(all) -all_m=all_s.mean + all=corr[0][1:] + all_s=stats.describe(all) + all_m=all_s.mean -x=np.array([np.concatenate((corr[i][1:i], - corr[i][i+1:])) for i in range(1,N+1)]) -# The above, although transposed, works because the correlation matrix -# is symmetric -xd=[stats.describe(x[i]) for i in range(N)] -xs=stats.describe(np.array([xd[i].mean for i in range(N)])) -xm=xs.mean -xsd=np.sqrt(xs.variance) + x=np.array([np.concatenate((corr[i][1:i], + corr[i][i+1:])) for i in range(1,N+1)]) + # The above, although transposed, works because the correlation matrix + # is symmetric + xd=[stats.describe(x[i]) for i in range(N)] + xs=stats.describe(np.array([xd[i].mean for i in range(N)])) + xm=xs.mean + xsd=np.sqrt(xs.variance) -x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] + x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] + + aa=ranks() + aa_by_all=aa[aa[:,1].argsort()] ### I need to review rows, e.g. counts[0] is an array of N+1 counts ### for the most common label in the complete crawl,