cc/cirrus_work: bin/spearman.py comparison

comparison bin/spearman.py @ 27:21da4d6521db

move all plots into functions

author	Henry S. Thompson <ht@inf.ed.ac.uk>
date	Wed, 16 Nov 2022 17:28:56 +0000
parents	5c5440e7854a
children	669a0b120d34

comparison

equal deleted inserted replaced

-:5c5440e7854a
+:21da4d6521db
 #!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment
+First column is for whole crawl, then 100 columns for segs 0-99
+Each row is counts for some property, e.g. mime-detected or tld
+For example
+tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+will produce such a file with 100 rows assuming all.tsv has the whole-crawl
+warc-only counts and s...tsv have the segment counts, all counts in column 1
+Usage: python3 -i spearman.py name
+where name.csv has the input
+'''
 import numpy as np
 from numpy import loadtxt
 from scipy import stats
 import statsmodels.api as sm
 import matplotlib.pyplot as plt
 import pylab
 import sys
-cc19=loadtxt(sys.argv[1],delimiter=',')
+def qqa():
-cc19s_o=stats.spearmanr(cc19,nan_policy='omit')
+# q-q plot for the whole crawl
-cc19s_x=np.array([np.concatenate((cc19s_o.correlation[i][1:i],cc19s_o.correlation[i][i+1:])) for i in range(1,101)])
+sm.qqplot(all, line='s')
-cc19s_xd=[stats.describe(cc19s_x[i]) for i in range(100)]
+plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
+plt.show()
+def qqs():
+# q-q plots for the best and worst (by variance) segments
+global xv, xworst, xbest
+xv=[d.variance for d in xd]
+xworst=xv.index(max(xv))
+xbest=xv.index(min(xv))
+print(xbest,xworst)
+sm.qqplot(x[xbest], line='s')
+plt.gca().set_title('Best segment (least variance): %s'%xbest)
+plt.show()
+sm.qqplot(x[xworst], line='s')
+plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+plt.show()
+def plot_x():
+plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
+plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
+plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
+plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl')
+plt.axis([0,99,0.8,1.0])
+plt.legend(loc='best')
+plt.grid(True)
+plt.show()
+def hist():
+sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
+fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+sdax=hax.twiny()
+hax.hist([xd[i].mean for i in range(100)],color='lightblue')
+hax.set_title('Mean of rank correlation of each segment x all other segments')
+for s,v in sdd:
+sdax.plot([v,v],[0,18],'b')
+sdax.set_xlim(hax.get_xlim())
+sdax.set_ylim(hax.get_ylim())
+sdax.set_xticks([v for s,v in sdd])
+sdax.set_xticklabels([str(s) for s,v in sdd])
+plt.show()
+counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+o=stats.spearmanr(counts,nan_policy='omit')
+all=o.correlation[0][1:]
+all_s=stats.describe(all)
+all_m=all_s.mean
+# Should get the confidence interval for this, so we can
+#  use it in plot_x
+x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)])
+xd=[stats.describe(x[i]) for i in range(100)]
+xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
+xm=xs.mean
+xsd=np.sqrt(xs.variance)

Mercurial > hg > cc > cirrus_work

comparison bin/spearman.py @ 27:21da4d6521db