Mercurial > hg > cc > cirrus_work
changeset 32:91741bf3ab51
add sort flag to plot_x
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 22 Nov 2022 11:02:51 +0000 |
parents | e7c8e64c2fdd |
children | 317bf47b506c |
files | bin/spearman.py |
diffstat | 1 files changed, 27 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/spearman.py Thu Nov 17 13:51:19 2022 +0000 +++ b/bin/spearman.py Tue Nov 22 11:02:51 2022 +0000 @@ -45,12 +45,20 @@ plt.gca().set_title('Worst segment (most variance): %s'%xworst) plt.show() -def plot_x(block=True): - plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments') - plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means') - plt.plot(all,'rx',label='Rank correlation of segment x whole crawl') - plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl') - plt.axis([0,99,0.8,1.0]) +def plot_x(sort=False,block=True): + # Make these two subplots... + if sort: + aso=np.argsort(-all) + plot_all=all[aso] + plot_x=np.array([xd[i].mean for i in range(N)])[aso] + else: + plot_all=all + plot_x=[xd[i].mean for i in range(N)] + plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl') + plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl') + plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') + plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') + plt.axis([0,N-1,0.8,1.0]) plt.legend(loc='best') plt.grid(True) plt.show(block=block) @@ -59,7 +67,7 @@ sdd=[(i,xm-(i*xsd)) for i in range(-2,3)] fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 sdax=hax.twiny() - hax.hist([xd[i].mean for i in range(100)],color='lightblue') + hax.hist([xd[i].mean for i in range(N)],color='lightblue') hax.set_title('Mean of rank correlation of each segment x all other segments') for s,v in sdd: sdax.plot([v,v],[0,18],'b') @@ -82,25 +90,26 @@ # convert to ranks, smallest value == highest rank all_ranked=stats.rankdata(-all,method='average') # invert since # large corr is good - x_variance_ranked=stats.rankdata([xd[i].variance for i in range(100)]) + x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) # small corr variance is good - 
x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(100)]) + x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)]) # invert since # large mean corr is good - fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(100)]) + fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)]) # invert since # large first diff is good return np.array([[i, all_ranked[i], x_variance_ranked[i], x_mean_ranked[i], - fd_ranked[i]] for i in range(100)]) + fd_ranked[i]] for i in range(N)]) counts=loadtxt(sys.argv[1]+".csv",delimiter=',') +N=counts.shape[0] # "If axis=0 (default), then each column represents a variable, with # observations in the rows" # So each column is a sequence of counts, for whole crawl in column 0 -# and for segments 0--99 in columns 1--100 +# and for segments 0--N-1 in columns 1--N corr=stats.spearmanr(counts,nan_policy='omit').correlation all=corr[0][1:] @@ -108,18 +117,18 @@ all_m=all_s.mean x=np.array([np.concatenate((corr[i][1:i], - corr[i][i+1:])) for i in range(1,101)]) + corr[i][i+1:])) for i in range(1,N+1)]) # The above, although transposed, works because the correlation matrix # is symmetric -xd=[stats.describe(x[i]) for i in range(100)] -xs=stats.describe(np.array([xd[i].mean for i in range(100)])) +xd=[stats.describe(x[i]) for i in range(N)] +xs=stats.describe(np.array([xd[i].mean for i in range(N)])) xm=xs.mean xsd=np.sqrt(xs.variance) -x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)] +x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] -### I need to review rows, e.g. counts[0] is an array of 101 counts +### I need to review rows, e.g. counts[0] is an array of N+1 counts ### for the most common label in the complete crawl, ### from the complete crawl and all the segments -### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts +### versus columns, e.g. counts[:,0] is an array of N decreasing counts ### for all the labels in the complete crawl