comparison bin/spearman.py @ 27:21da4d6521db

move all plots into functions
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 16 Nov 2022 17:28:56 +0000
parents 5c5440e7854a
children 669a0b120d34
comparison
equal deleted inserted replaced
26:5c5440e7854a 27:21da4d6521db
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 '''Rank correlation processing for a csv tabulation of counts by segment
3 First column is for whole crawl, then 100 columns for segs 0-99
4 Each row is counts for some property, e.g. mime-detected or tld
5
6 For example
7
8 tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
9
10 will produce such a file with 100 rows assuming all.tsv has the whole-crawl
11 warc-only counts and s...tsv have the segment counts, all counts in column 1
12
13 Usage: python3 -i spearman.py name
14 where name.csv has the input
15 '''
16
2 import numpy as np 17 import numpy as np
3 from numpy import loadtxt 18 from numpy import loadtxt
4 from scipy import stats 19 from scipy import stats
5 import statsmodels.api as sm 20 import statsmodels.api as sm
6 import matplotlib.pyplot as plt 21 import matplotlib.pyplot as plt
7 import pylab 22 import pylab
8 23
9 import sys 24 import sys
10 25
11 cc19=loadtxt(sys.argv[1],delimiter=',') 26 def qqa():
12 cc19s_o=stats.spearmanr(cc19,nan_policy='omit') 27 # q-q plot for the whole crawl
13 cc19s_x=np.array([np.concatenate((cc19s_o.correlation[i][1:i],cc19s_o.correlation[i][i+1:])) for i in range(1,101)]) 28 sm.qqplot(all, line='s')
14 cc19s_xd=[stats.describe(cc19s_x[i]) for i in range(100)] 29 plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
30 plt.show()
31
32 def qqs():
33 # q-q plots for the best and worst (by variance) segments
34 global xv, xworst, xbest
35 xv=[d.variance for d in xd]
36 xworst=xv.index(max(xv))
37 xbest=xv.index(min(xv))
38 print(xbest,xworst)
39 sm.qqplot(x[xbest], line='s')
40 plt.gca().set_title('Best segment (least variance): %s'%xbest)
41 plt.show()
42 sm.qqplot(x[xworst], line='s')
43 plt.gca().set_title('Worst segment (most variance): %s'%xworst)
44 plt.show()
45
46 def plot_x():
47 plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
48 plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
49 plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
50 plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl')
51 plt.axis([0,99,0.8,1.0])
52 plt.legend(loc='best')
53 plt.grid(True)
54 plt.show()
55
56 def hist():
57 sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
58 fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
59 sdax=hax.twiny()
60 hax.hist([xd[i].mean for i in range(100)],color='lightblue')
61 hax.set_title('Mean of rank correlation of each segment x all other segments')
62 for s,v in sdd:
63 sdax.plot([v,v],[0,18],'b')
64 sdax.set_xlim(hax.get_xlim())
65 sdax.set_ylim(hax.get_ylim())
66 sdax.set_xticks([v for s,v in sdd])
67 sdax.set_xticklabels([str(s) for s,v in sdd])
68 plt.show()
69
70 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
71 o=stats.spearmanr(counts,nan_policy='omit')
72
73 all=o.correlation[0][1:]
74 all_s=stats.describe(all)
75 all_m=all_s.mean
76 # Should get the confidence interval for this, so we can
77 # use it in plot_x
78
79 x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)])
80 xd=[stats.describe(x[i]) for i in range(100)]
81 xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
82 xm=xs.mean
83 xsd=np.sqrt(xs.variance)