Mercurial > hg > cc > cirrus_work
comparison bin/spearman.py @ 27:21da4d6521db
move all plots into functions
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 16 Nov 2022 17:28:56 +0000 |
parents | 5c5440e7854a |
children | 669a0b120d34 |
comparison
equal
deleted
inserted
replaced
26:5c5440e7854a | 27:21da4d6521db |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 '''Rank correlation processing for a csv tabulation of counts by segment | |
3 First column is for whole crawl, then 100 columns for segs 0-99 | |
4 Each row is counts for some property, e.g. mime-detected or tld | |
5 | |
6 For example | |
7 | |
8 tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv | |
9 | |
10 will produce such a file with 100 rows assuming all.tsv has the whole-crawl | |
11 warc-only counts and s...tsv have the segment counts, all counts in column 1 | |
12 | |
13 Usage: python3 -i spearman.py name | |
14 where name.csv has the input | |
15 ''' | |
16 | |
2 import numpy as np | 17 import numpy as np |
3 from numpy import loadtxt | 18 from numpy import loadtxt |
4 from scipy import stats | 19 from scipy import stats |
5 import statsmodels.api as sm | 20 import statsmodels.api as sm |
6 import matplotlib.pyplot as plt | 21 import matplotlib.pyplot as plt |
7 import pylab | 22 import pylab |
8 | 23 |
9 import sys | 24 import sys |
10 | 25 |
11 cc19=loadtxt(sys.argv[1],delimiter=',') | 26 def qqa(): |
12 cc19s_o=stats.spearmanr(cc19,nan_policy='omit') | 27 # q-q plot for the whole crawl |
13 cc19s_x=np.array([np.concatenate((cc19s_o.correlation[i][1:i],cc19s_o.correlation[i][i+1:])) for i in range(1,101)]) | 28 sm.qqplot(all, line='s') |
14 cc19s_xd=[stats.describe(cc19s_x[i]) for i in range(100)] | 29 plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)') |
30 plt.show() | |
31 | |
32 def qqs(): | |
33 # q-q plots for the best and worst (by variance) segments | |
34 global xv, xworst, xbest | |
35 xv=[d.variance for d in xd] | |
36 xworst=xv.index(max(xv)) | |
37 xbest=xv.index(min(xv)) | |
38 print(xbest,xworst) | |
39 sm.qqplot(x[xbest], line='s') | |
40 plt.gca().set_title('Best segment (least variance): %s'%xbest) | |
41 plt.show() | |
42 sm.qqplot(x[xworst], line='s') | |
43 plt.gca().set_title('Worst segment (most variance): %s'%xworst) | |
44 plt.show() | |
45 | |
46 def plot_x(): | |
47 plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments') | |
48 plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means') | |
49 plt.plot(all,'rx',label='Rank correlation of segment x whole crawl') | |
50 plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl') | |
51 plt.axis([0,99,0.8,1.0]) | |
52 plt.legend(loc='best') | |
53 plt.grid(True) | |
54 plt.show() | |
55 | |
56 def hist(): | |
57 sdd=[(i,xm-(i*xsd)) for i in range(-2,3)] | |
58 fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 | |
59 sdax=hax.twiny() | |
60 hax.hist([xd[i].mean for i in range(100)],color='lightblue') | |
61 hax.set_title('Mean of rank correlation of each segment x all other segments') | |
62 for s,v in sdd: | |
63 sdax.plot([v,v],[0,18],'b') | |
64 sdax.set_xlim(hax.get_xlim()) | |
65 sdax.set_ylim(hax.get_ylim()) | |
66 sdax.set_xticks([v for s,v in sdd]) | |
67 sdax.set_xticklabels([str(s) for s,v in sdd]) | |
68 plt.show() | |
69 | |
70 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') | |
71 o=stats.spearmanr(counts,nan_policy='omit') | |
72 | |
73 all=o.correlation[0][1:] | |
74 all_s=stats.describe(all) | |
75 all_m=all_s.mean | |
76 # Should get the confidence interval for this, so we can | |
77 # use it in plot_x | |
78 | |
79 x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)]) | |
80 xd=[stats.describe(x[i]) for i in range(100)] | |
81 xs=stats.describe(np.array([xd[i].mean for i in range(100)])) | |
82 xm=xs.mean | |
83 xsd=np.sqrt(xs.variance) |