Mercurial > hg > cc > cirrus_work
annotate bin/spearman.py @ 72:fd9bcd759606
work with csing
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 08 Aug 2023 17:46:20 +0100 |
parents | da1cbcd8acee |
children |
rev | line source |
---|---|
25
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
2 '''Rank correlation processing for a csv tabulation of counts by segment |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
3 First column is for whole crawl, then 100 columns for segs 0-99 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
4 Each row is counts for some property, e.g. mime-detected or tld |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
5 |
30
c73ec9deabbe
comments and more care about rows vs. columns
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
29
diff
changeset
|
6 For example, assuming all.tsv has the whole-crawl warc-only counts |
c73ec9deabbe
comments and more care about rows vs. columns
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
29
diff
changeset
|
7 and s...tsv have the segment counts, all with counts in column 1, |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
8 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
9 tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
10 |
30
c73ec9deabbe
comments and more care about rows vs. columns
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
29
diff
changeset
|
11 will produce such a file with |
c73ec9deabbe
comments and more care about rows vs. columns
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
29
diff
changeset
|
12 * 100 rows, one for each of the top 100 counts |
c73ec9deabbe
comments and more care about rows vs. columns
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
29
diff
changeset
|
13 * 101 columns, 0 for all and 1--100 for segs 0--99 |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
14 |
37 | 15 Usage: python3 -i spearman.py name id |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
16 where name.csv has the input |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
17 ''' |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
18 |
25
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 import numpy as np |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 from numpy import loadtxt |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 from scipy import stats |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 import statsmodels.api as sm |
26 | 23 import matplotlib.pyplot as plt |
25
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 import pylab |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 |
34
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
26 import sys, math |
25
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
28 def qqa(): |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
29 # q-q plot for the whole crawl |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
30 sm.qqplot(all, line='s') |
37 | 31 plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id) |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
32 plt.show() |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
33 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
34 def qqs(): |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
35 # q-q plots for the best and worst (by variance) segments |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
36 global xv, xworst, xbest |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
37 xv=[d.variance for d in xd] |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
38 xworst=xv.index(max(xv)) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
39 xbest=xv.index(min(xv)) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
40 print(xbest,xworst) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
41 sm.qqplot(x[xbest], line='s') |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
42 plt.gca().set_title('Best segment (least variance): %s'%xbest) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
43 plt.show() |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
44 sm.qqplot(x[xworst], line='s') |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
46 plt.show() |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
47 |
37 | 48 def plot_x(sort=False,block=True,all_only=True,title=None): |
34
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
49 # Make these two subplots, w. and w/o sorting |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
50 # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
51 # for legend hacking |
32 | 52 if sort: |
53 aso=np.argsort(-all) | |
54 plot_all=all[aso] | |
55 plot_x=np.array([xd[i].mean for i in range(N)])[aso] | |
56 else: | |
57 plot_all=all | |
58 plot_x=[xd[i].mean for i in range(N)] | |
37 | 59 if title is None: |
60 l1='Rank correlation of segment x whole crawl' | |
61 l2='Mean of segment x whole crawl' | |
62 plt.legend(loc='best',fontsize='small') | |
63 else: | |
64 l1=l2=None | |
65 plt.plot(plot_all,'rx',label=l1) | |
66 plt.plot([0,N-1],[all_m,all_m],'r',label=l2) | |
67 if not(all_only): | |
68 plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') | |
69 plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') | |
70 plt.axis([0,N-1,0.85 if all_only else 0.8,1.0]) | |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
71 plt.grid(True) |
37 | 72 if title is not None: |
73 plt.title(title) | |
31
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
74 plt.show(block=block) |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
75 |
37 | 76 def hist_x(align='mid'): |
33 | 77 hist(xm,xsd,[xd[i].mean for i in range(N)], |
37 | 78 'Mean of rank correlation of each segment x all other segments', |
79 align) | |
33 | 80 |
37 | 81 def hist_all(align='mid'): |
33 | 82 hist(all_m,np.sqrt(all_s.variance),all, |
37 | 83 'Rank correlation of each segment x whole crawl %s'%id, |
84 align) | |
33 | 85 |
37 | 86 def hist(m,sd,hh,title,align): |
33 | 87 sdd=[(i,m-(i*sd)) for i in range(-2,3)] |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
88 fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
89 sdax=hax.twiny() |
37 | 90 hax.hist(hh,color='lightblue',align=align) |
33 | 91 hax.set_title(title) |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
92 for s,v in sdd: |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
93 sdax.plot([v,v],[0,18],'b') |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
94 sdax.set_xlim(hax.get_xlim()) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
95 sdax.set_ylim(hax.get_ylim()) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
96 sdax.set_xticks([v for s,v in sdd]) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
97 sdax.set_xticklabels([str(s) for s,v in sdd]) |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
98 plt.show() |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
99 |
34
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
100 def ci(rho,n,conf=0.95): |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
101 # Courtesy of https://stats.stackexchange.com/a/18904 |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
102 # rho is (rank) correlation, n is sample size |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
103 stderr=1.0/math.sqrt(n-3) |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
104 z=stats.norm.ppf(1.0-((1.0-conf)/2)) |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
105 delta=z*stderr |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
106 lower=math.tanh(math.atanh(rho)-delta) |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
107 upper=math.tanh(math.atanh(rho)+delta) |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
108 return (lower,upper) |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
109 |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
110 def plot_ci(rhos,n,trim=None,conf=0.95): |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
111 # rhos are (rank) correlation values |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
112 rhos_s=rhos[(-rhos).argsort()] |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
113 if trim is None: |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
114 l=len(rhos) |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
115 else: |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
116 rhos_s=rhos_s[:trim] |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
117 l=trim |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
118 cc=(np.array([ci(r,n,conf) for r in rhos_s])).T |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
119 ue=cc[1]-rhos_s |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
120 le=rhos_s-cc[0] |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
121 #for i in range(len(rhos)): |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
122 #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
123 plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') |
37 | 124 plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100)) |
34
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
125 plt.show() |
2e8002a64f72
compute and graph confidence intervals
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
126 |
29 | 127 def first_diff(ranks): |
128 # first disagreement with baseline == {1,2,...} | |
129 for i in range(len(ranks)): | |
130 if ranks[i]!=i+1.0: | |
131 return i | |
132 return i+1 | |
133 | |
134 def ranks(): | |
135 # Combine segment measures: | |
136 # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement | |
31
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
137 # convert to ranks, smallest value == highest rank |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
138 all_ranked=stats.rankdata(-all,method='average') # invert since |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
139 # large corr is good |
32 | 140 x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) |
31
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
141 # small corr variance is good |
32 | 142 x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)]) |
31
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
143 # invert since |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
144 # large mean corr is good |
32 | 145 fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)]) |
31
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
146 # invert since |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
147 # large first diff is good |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
148 return np.array([[i, |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
149 all_ranked[i], |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
150 x_variance_ranked[i], |
e7c8e64c2fdd
get multi-ranking done right
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
151 x_mean_ranked[i], |
32 | 152 fd_ranked[i]] for i in range(N)]) |
29 | 153 |
37 | 154 def main(): |
155 global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr | |
156 global aa, aa_by_all, N | |
157 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') | |
158 id=sys.argv[2] | |
159 N=counts.shape[1]-1 | |
160 # "If axis=0 (default), then each column represents a variable, with | |
161 # observations in the rows" | |
162 # So each column is a sequence of counts, for whole crawl in column 0 | |
163 # and for segments 0--N-1 in columns 1--N | |
164 corr=stats.spearmanr(counts,nan_policy='omit').correlation | |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
165 |
37 | 166 all=corr[0][1:] |
167 all_s=stats.describe(all) | |
168 all_m=all_s.mean | |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
169 |
37 | 170 x=np.array([np.concatenate((corr[i][1:i], |
171 corr[i][i+1:])) for i in range(1,N+1)]) | |
172 # The above, although transposed, works because the correlation matrix | |
173 # is symmetric | |
174 xd=[stats.describe(x[i]) for i in range(N)] | |
175 xs=stats.describe(np.array([xd[i].mean for i in range(N)])) | |
176 xm=xs.mean | |
177 xsd=np.sqrt(xs.variance) | |
29 | 178 |
37 | 179 x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] |
180 | |
181 aa=ranks() | |
182 aa_by_all=aa[aa[:,1].argsort()] | |
30
c73ec9deabbe
comments and more care about rows vs. columns
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
29
diff
changeset
|
183 |
32 | 184 ### I need to review rows, e.g. counts[0] is an array of N+1 counts |
29 | 185 ### for the most common label in the complete crawl, |
186 ### from the complete crawl and all the segments | |
32 | 187 ### versus columns, e.g. counts[:,0] is an array of N decreasing counts |
29 | 188 ### for all the labels in the complete crawl |