Mercurial > hg > cc > cirrus_work
comparison bin/spearman.py @ 29:669a0b120d34
start work on ranking,
lose faith in getting row vs. column correct every time
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 16 Nov 2022 19:52:50 +0000 |
parents | 21da4d6521db |
children | c73ec9deabbe |
comparison
equal
deleted
inserted
replaced
28:7ffb686ca060 | 29:669a0b120d34 |
---|---|
65 sdax.set_ylim(hax.get_ylim()) | 65 sdax.set_ylim(hax.get_ylim()) |
66 sdax.set_xticks([v for s,v in sdd]) | 66 sdax.set_xticks([v for s,v in sdd]) |
67 sdax.set_xticklabels([str(s) for s,v in sdd]) | 67 sdax.set_xticklabels([str(s) for s,v in sdd]) |
68 plt.show() | 68 plt.show() |
69 | 69 |
70 def first_diff(ranks): | |
71 # first disagreement with baseline == {1,2,...} | |
72 for i in range(len(ranks)): | |
73 if ranks[i]!=i+1.0: | |
74 return i | |
75 return i+1 | |
76 | |
77 def ranks(): | |
78 # Combine segment measures: | |
79 # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement | |
80 return np.array([i,all[i],1.0/xd[i].variance,xd[i].mean,first_diff(ranks[i])]) | |
81 | |
70 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') | 82 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') |
71 o=stats.spearmanr(counts,nan_policy='omit') | 83 # "If axis=0 (default), then each column represents a variable, with |
84 # observations in the rows" | |
85 ranks=[stats.rankdata(-counts[i],method='average') for for i in range(1,100)] | |
86 corr=stats.spearmanr(counts,nan_policy='omit').correlation | |
72 | 87 |
73 all=o.correlation[0][1:] | 88 all=corr[0][1:] |
74 all_s=stats.describe(all) | 89 all_s=stats.describe(all) |
75 all_m=all_s.mean | 90 all_m=all_s.mean |
76 # Should get the confidence interval for this, so we can | |
77 # use it in plot_x | |
78 | 91 |
79 x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)]) | 92 x=np.array([np.concatenate((corr[i][1:i], |
93 corr[i][i+1:])) for i in range(1,101)]) | |
80 xd=[stats.describe(x[i]) for i in range(100)] | 94 xd=[stats.describe(x[i]) for i in range(100)] |
81 xs=stats.describe(np.array([xd[i].mean for i in range(100)])) | 95 xs=stats.describe(np.array([xd[i].mean for i in range(100)])) |
82 xm=xs.mean | 96 xm=xs.mean |
83 xsd=np.sqrt(xs.variance) | 97 xsd=np.sqrt(xs.variance) |
98 | |
99 ### I need to review rows, e.g. counts[0] is an array of 101 counts | |
100 ### for the most common label in the complete crawl, | |
101 ### from the complete crawl and all the segments | |
102 ### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts | |
103 ### for all the labels in the complete crawl |