Mercurial > hg > cc > cirrus_work
changeset 27:21da4d6521db
move all plots into functions
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 16 Nov 2022 17:28:56 +0000 |
parents | 5c5440e7854a |
children | 7ffb686ca060 |
files | bin/spearman.py |
diffstat | 1 files changed, 73 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/spearman.py Tue Nov 15 19:37:28 2022 +0000
+++ b/bin/spearman.py Wed Nov 16 17:28:56 2022 +0000
@@ -1,4 +1,19 @@
 #!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment
+   First column is for whole crawl, then 100 columns for segs 0-99
+   Each row is counts for some property, e.g. mime-detected or tld
+
+   For example
+
+    tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+
+   will produce such a file with 100 rows assuming all.tsv has the whole-crawl
+    warc-only counts and s...tsv have the segment counts, all counts in column 1
+
+   Usage: python3 -i spearman.py name
+    where name.csv has the input
+'''
+
 import numpy as np
 from numpy import loadtxt
 from scipy import stats
@@ -8,7 +23,61 @@
 
 import sys
 
-cc19=loadtxt(sys.argv[1],delimiter=',')
-cc19s_o=stats.spearmanr(cc19,nan_policy='omit')
-cc19s_x=np.array([np.concatenate((cc19s_o.correlation[i][1:i],cc19s_o.correlation[i][i+1:])) for i in range(1,101)])
-cc19s_xd=[stats.describe(cc19s_x[i]) for i in range(100)]
+def qqa():
+  # q-q plot for the whole crawl
+  sm.qqplot(all, line='s')
+  plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
+  plt.show()
+
+def qqs():
+  # q-q plots for the best and worst (by variance) segments
+  global xv, xworst, xbest
+  xv=[d.variance for d in xd]
+  xworst=xv.index(max(xv))
+  xbest=xv.index(min(xv))
+  print(xbest,xworst)
+  sm.qqplot(x[xbest], line='s')
+  plt.gca().set_title('Best segment (least variance): %s'%xbest)
+  plt.show()
+  sm.qqplot(x[xworst], line='s')
+  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+  plt.show()
+
+def plot_x():
+  plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
+  plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
+  plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
+  plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl')
+  plt.axis([0,99,0.8,1.0])
+  plt.legend(loc='best')
+  plt.grid(True)
+  plt.show()
+
+def hist():
+  sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
+  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+  sdax=hax.twiny()
+  hax.hist([xd[i].mean for i in range(100)],color='lightblue')
+  hax.set_title('Mean of rank correlation of each segment x all other segments')
+  for s,v in sdd:
+    sdax.plot([v,v],[0,18],'b')
+  sdax.set_xlim(hax.get_xlim())
+  sdax.set_ylim(hax.get_ylim())
+  sdax.set_xticks([v for s,v in sdd])
+  sdax.set_xticklabels([str(s) for s,v in sdd])
+  plt.show()
+
+counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+o=stats.spearmanr(counts,nan_policy='omit')
+
+all=o.correlation[0][1:]
+all_s=stats.describe(all)
+all_m=all_s.mean
+# Should get the confidence interval for this, so we can
+# use it in plot_x
+
+x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)])
+xd=[stats.describe(x[i]) for i in range(100)]
+xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
+xm=xs.mean
+xsd=np.sqrt(xs.variance)
```