Mercurial > hg > cc > cirrus_work
annotate bin/spearman.py @ 29:669a0b120d34
start work on ranking,
lose faith in getting row vs. column correct every time
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 16 Nov 2022 19:52:50 +0000 |
parents | 21da4d6521db |
children | c73ec9deabbe |
rev | line source |
---|---|
25
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
2 '''Rank correlation processing for a csv tabulation of counts by segment |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
3 First column is for whole crawl, then 100 columns for segs 0-99 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
4 Each row is counts for some property, e.g. mime-detected or tld |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
5 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
6 For example |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
7 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
8 tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
9 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
10 will produce such a file with 100 rows assuming all.tsv has the whole-crawl |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
11 warc-only counts and s...tsv have the segment counts, all counts in column 1 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
12 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
13 Usage: python3 -i spearman.py name |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
14 where name.csv has the input |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
15 ''' |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
16 |
25
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 import numpy as np |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 from numpy import loadtxt |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 from scipy import stats |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 import statsmodels.api as sm |
26 | 21 import matplotlib.pyplot as plt |
25
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 import pylab |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 import sys |
50337cd1d16f
framework for stats over results of rank correlations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 |
27
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
def qqa():
    """Show a q-q plot of the segment-vs-whole-crawl rank correlations.

    Plots the module-level ``all`` array with statsmodels' ``qqplot``,
    titles the current axes and displays the figure with ``plt.show()``.
    Takes no arguments and returns nothing.
    """
    sm.qqplot(all, line='s')
    axes = plt.gca()
    axes.set_title('Rank correlation per segment wrt whole crawl (warc results only)')
    plt.show()
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
31 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
def qqs():
    """Show q-q plots for the best and worst segments, judged by variance.

    Side effects: (re)binds the module globals ``xv`` (variance of each
    entry of ``xd``), ``xworst`` (index of the largest variance) and
    ``xbest`` (index of the smallest variance), prints ``xbest`` and
    ``xworst``, then displays one q-q plot for each of the two segments.
    """
    global xv, xworst, xbest
    xv = [summary.variance for summary in xd]
    # list.index returns the first position, so ties go to the lowest index
    xworst = xv.index(max(xv))
    xbest = xv.index(min(xv))
    print(xbest, xworst)
    # Best first, then worst: same plot recipe, different title and row of x
    for title, seg in (('Best segment (least variance): %s', xbest),
                       ('Worst segment (most variance): %s', xworst)):
        sm.qqplot(x[seg], line='s')
        plt.gca().set_title(title % seg)
        plt.show()
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
45 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
def plot_x():
    """Plot per-segment correlation measures against segment index.

    Blue crosses are the ``mean`` of each of the first 100 entries of
    ``xd``, with a blue horizontal line at ``xm``; red crosses are the
    values of ``all``, with a red horizontal line at ``all_m``.
    The x axis is fixed to 0..99 and the y axis to 0.8..1.0.
    """
    x_span = [0, 99]
    seg_means = [xd[seg].mean for seg in range(100)]
    plt.plot(seg_means, 'bx',
             label='Mean of rank correlation of each segment x all other segments')
    plt.plot(x_span, [xm, xm], 'b',
             label='Mean of segment x segment means')
    plt.plot(all, 'rx',
             label='Rank correlation of segment x whole crawl')
    plt.plot(x_span, [all_m, all_m], 'r',
             label='Mean of segment x whole crawl')
    plt.axis(x_span + [0.8, 1.0])
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
55 |
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
def hist():
    """Show a histogram of the per-segment means with sigma markers on top.

    The histogram of ``xd[i].mean`` for the first 100 segments is drawn on
    the main axes; a twinned axes sharing the y axis carries vertical lines
    and top tick marks at ``xm - k*xsd`` for k in -2..2.
    """
    # (multiplier, x position) pairs for the standard-deviation markers.
    # NOTE(review): the tick labelled k sits at xm - k*xsd, so positive
    # labels fall below the mean -- confirm this sign is intended.
    sdd = [(mult, xm - (mult * xsd)) for mult in range(-2, 3)]
    fig, hax = plt.subplots()  # Thanks to https://stackoverflow.com/a/7769497
    sdax = hax.twiny()
    hax.hist([xd[seg].mean for seg in range(100)], color='lightblue')
    hax.set_title('Mean of rank correlation of each segment x all other segments')
    marker_xs = [pos for _, pos in sdd]
    for pos in marker_xs:
        # vertical line from y=0 up to the fixed height 18
        sdax.plot([pos, pos], [0, 18], 'b')
    # Keep the overlay aligned with the histogram axes
    sdax.set_xlim(hax.get_xlim())
    sdax.set_ylim(hax.get_ylim())
    sdax.set_xticks(marker_xs)
    sdax.set_xticklabels([str(mult) for mult, _ in sdd])
    plt.show()
21da4d6521db
move all plots into functions
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
26
diff
changeset
|
69 |
def first_diff(ranks):
    """Return the index of the first rank that disagrees with the baseline.

    The baseline is the sequence 1, 2, 3, ...: position ``i`` agrees when
    ``ranks[i] == i + 1``.

    ranks: a sequence (list or 1-D array) of rank values.

    Returns the 0-based index of the first disagreement, or ``len(ranks)``
    when every position agrees.  An empty sequence therefore yields 0
    (the original fell through to an unbound loop variable and raised).
    """
    for i, rank in enumerate(ranks):
        if rank != i + 1.0:
            return i
    return len(ranks)
76 | |
def ranks():
    """Combine the per-segment measures into one table.

    Returns a 2-D array with one row per entry of ``xd`` (i.e. per
    segment) and five columns:
      segID, rank corr. wrt all, inverse variance, mean cross rank corr.,
      first disagreement (see first_diff) of that segment's ranking.

    Reads the module-level ``all``, ``xd`` and ``seg_ranks``.
    """
    return np.array([[i,
                      all[i],
                      1.0 / xd[i].variance,
                      xd[i].mean,
                      first_diff(seg_ranks[i])]
                     for i in range(len(xd))])


counts = loadtxt(sys.argv[1] + ".csv", delimiter=',')
# Column 0 is the whole crawl, every further column is one segment
n_segs = counts.shape[1] - 1

# Rank the labels (rows) within each segment (column), largest count == rank 1.
# Kept under its own name so it no longer overwrites the ranks() function.
# NOTE(review): how rankdata treats NaN counts depends on the SciPy
# version -- confirm for inputs with missing labels.
seg_ranks = [stats.rankdata(-counts[:, i], method='average')
             for i in range(1, n_segs + 1)]

# "If axis=0 (default), then each column represents a variable, with
#  observations in the rows"
corr = stats.spearmanr(counts, nan_policy='omit').correlation

# Correlation of each segment with the whole crawl.
# (The name shadows the builtin all(); kept because the plot functions use it.)
all = corr[0][1:]
all_s = stats.describe(all)
all_m = all_s.mean

# For each segment, its correlations with every *other* segment:
# drop column 0 (whole crawl) and the diagonal entry
x = np.array([np.concatenate((corr[i][1:i],
                              corr[i][i + 1:])) for i in range(1, n_segs + 1)])
xd = [stats.describe(x[i]) for i in range(n_segs)]
xs = stats.describe(np.array([d.mean for d in xd]))
xm = xs.mean
xsd = np.sqrt(xs.variance)
29 | 98 |
99 ### I need to review rows, e.g. counts[0] is an array of 101 counts | |
100 ### for the most common label in the complete crawl, | |
101 ### from the complete crawl and all the segments | |
102 ### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts | |
103 ### for all the labels in the complete crawl |