comparison bin/spearman.py @ 32:91741bf3ab51
add sort flag to plot_x
author | Henry S. Thompson <ht@inf.ed.ac.uk>
date | Tue, 22 Nov 2022 11:02:51 +0000
parents | e7c8e64c2fdd
children | 317bf47b506c
31:e7c8e64c2fdd (parent) | 32:91741bf3ab51 (this changeset) |
---|---|
43 plt.show() | 43 plt.show() |
44 sm.qqplot(x[xworst], line='s') | 44 sm.qqplot(x[xworst], line='s') |
45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) | 45 plt.gca().set_title('Worst segment (most variance): %s'%xworst) |
46 plt.show() | 46 plt.show() |
47 | 47 |
48 def plot_x(block=True): | 48 def plot_x(sort=False,block=True): |
49 plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments') | 49 # Make these two subplots... |
50 plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means') | 50 if sort: |
51 plt.plot(all,'rx',label='Rank correlation of segment x whole crawl') | 51 aso=np.argsort(-all) |
52 plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl') | 52 plot_all=all[aso] |
53 plt.axis([0,99,0.8,1.0]) | 53 plot_x=np.array([xd[i].mean for i in range(N)])[aso] |
 | 54 else: |
 | 55 plot_all=all |
 | 56 plot_x=[xd[i].mean for i in range(N)] |
 | 57 plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl') |
 | 58 plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl') |
 | 59 plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments') |
 | 60 plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') |
 | 61 plt.axis([0,N-1,0.8,1.0]) |
54 plt.legend(loc='best') | 62 plt.legend(loc='best') |
55 plt.grid(True) | 63 plt.grid(True) |
56 plt.show(block=block) | 64 plt.show(block=block) |
57 | 65 |
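A minimal sketch of what the new sort flag does, using invented numbers in place of the script's all array and per-segment means: np.argsort(-all) gives the permutation that puts the whole-crawl correlations in descending order, and applying the same index array to both series keeps each segment's pair of statistics aligned.

```python
import numpy as np

# Stand-ins for the script's `all` (segment x whole-crawl correlations)
# and the per-segment mean cross-correlations; values are invented.
whole_crawl_corr = np.array([0.91, 0.97, 0.88, 0.95])
segment_means = np.array([0.90, 0.96, 0.87, 0.94])

aso = np.argsort(-whole_crawl_corr)   # permutation for descending order
print(whole_crawl_corr[aso])          # [0.97 0.95 0.91 0.88]
print(segment_means[aso])             # [0.96 0.94 0.9  0.87] -- still paired per segment
```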
58 def hist(): | 66 def hist(): |
59 sdd=[(i,xm-(i*xsd)) for i in range(-2,3)] | 67 sdd=[(i,xm-(i*xsd)) for i in range(-2,3)] |
60 fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 | 68 fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 |
61 sdax=hax.twiny() | 69 sdax=hax.twiny() |
62 hax.hist([xd[i].mean for i in range(100)],color='lightblue') | 70 hax.hist([xd[i].mean for i in range(N)],color='lightblue') |
63 hax.set_title('Mean of rank correlation of each segment x all other segments') | 71 hax.set_title('Mean of rank correlation of each segment x all other segments') |
64 for s,v in sdd: | 72 for s,v in sdd: |
65 sdax.plot([v,v],[0,18],'b') | 73 sdax.plot([v,v],[0,18],'b') |
66 sdax.set_xlim(hax.get_xlim()) | 74 sdax.set_xlim(hax.get_xlim()) |
67 sdax.set_ylim(hax.get_ylim()) | 75 sdax.set_ylim(hax.get_ylim()) |
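For context, hist() overlays standard-deviation markers on the histogram by drawing them on a second x-axis created with Axes.twiny() and then copying the primary axis limits. A self-contained sketch of that trick, with synthetic data rather than the real segment means:

```python
import numpy as np
import matplotlib.pyplot as plt

vals = np.random.default_rng(0).normal(0.95, 0.01, 100)   # synthetic segment means
m, sd = vals.mean(), vals.std()

fig, hax = plt.subplots()
sdax = hax.twiny()                      # second x-axis sharing the plot area
hax.hist(vals, color='lightblue')
for i in range(-2, 3):                  # lines at the mean and +/-1, +/-2 sd
    sdax.plot([m - i*sd, m - i*sd], [0, 18], 'b')
sdax.set_xlim(hax.get_xlim())           # without this the two x-scales would not line up
sdax.set_ylim(hax.get_ylim())
plt.show()
```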
80 # Combine segment measures: | 88 # Combine segment measures: |
81 # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement | 89 # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement |
82 # convert to ranks, smallest value == highest rank | 90 # convert to ranks, smallest value == highest rank |
83 all_ranked=stats.rankdata(-all,method='average') # invert since | 91 all_ranked=stats.rankdata(-all,method='average') # invert since |
84 # large corr is good | 92 # large corr is good |
85 x_variance_ranked=stats.rankdata([xd[i].variance for i in range(100)]) | 93 x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) |
86 # small corr variance is good | 94 # small corr variance is good |
87 x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(100)]) | 95 x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)]) |
88 # invert since | 96 # invert since |
89 # large mean corr is good | 97 # large mean corr is good |
90 fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(100)]) | 98 fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)]) |
91 # invert since | 99 # invert since |
92 # large first diff is good | 100 # large first diff is good |
93 return np.array([[i, | 101 return np.array([[i, |
94 all_ranked[i], | 102 all_ranked[i], |
95 x_variance_ranked[i], | 103 x_variance_ranked[i], |
96 x_mean_ranked[i], | 104 x_mean_ranked[i], |
97 fd_ranked[i]] for i in range(100)]) | 105 fd_ranked[i]] for i in range(N)]) |
98 | 106 |
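A small worked example of the ranking convention used above (smallest value == highest rank), with invented numbers: measures where a large value is good are negated before ranking, while the variance, where small is good, is ranked as-is; ties get the average rank.

```python
import numpy as np
from scipy import stats

corrs = np.array([0.93, 0.97, 0.91, 0.97])        # "large is good"
print(stats.rankdata(-corrs, method='average'))   # [3.  1.5 4.  1.5]

variances = np.array([2e-4, 5e-4, 1e-4, 3e-4])    # "small is good"
print(stats.rankdata(variances))                  # [2. 4. 1. 3.]
```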
99 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') | 107 counts=loadtxt(sys.argv[1]+".csv",delimiter=',') |
 | 108 N=counts.shape[0] |
100 # "If axis=0 (default), then each column represents a variable, with | 109 # "If axis=0 (default), then each column represents a variable, with |
101 # observations in the rows" | 110 # observations in the rows" |
102 # So each column is a sequence of counts, for whole crawl in column 0 | 111 # So each column is a sequence of counts, for whole crawl in column 0 |
103 # and for segments 0--99 in columns 1--100 | 112 # and for segments 0--N-1 in columns 1--N |
104 corr=stats.spearmanr(counts,nan_policy='omit').correlation | 113 corr=stats.spearmanr(counts,nan_policy='omit').correlation |
105 | 114 |
106 all=corr[0][1:] | 115 all=corr[0][1:] |
107 all_s=stats.describe(all) | 116 all_s=stats.describe(all) |
108 all_m=all_s.mean | 117 all_m=all_s.mean |
109 | 118 |
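A toy illustration of the axis=0 convention quoted above (invented counts, three columns instead of N+1): each column is one variable, so row 0 of the resulting correlation matrix, minus its first entry, holds each segment's rank correlation with the whole crawl, which is what the all=corr[0][1:] line extracts.

```python
import numpy as np
from scipy import stats

# 5 labels (rows) x 3 variables (columns): column 0 = whole crawl, 1-2 = segments
toy = np.array([[100, 55, 41],
                [ 80, 40, 48],
                [ 60, 35, 30],
                [ 40, 20, 22],
                [ 20, 12,  9]])

corr = stats.spearmanr(toy, nan_policy='omit').correlation
print(corr.shape)     # (3, 3): one row/column per variable
print(corr[0][1:])    # [1.  0.9] -- each segment vs the whole crawl
```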
110 x=np.array([np.concatenate((corr[i][1:i], | 119 x=np.array([np.concatenate((corr[i][1:i], |
111 corr[i][i+1:])) for i in range(1,101)]) | 120 corr[i][i+1:])) for i in range(1,N+1)]) |
112 # The above, although transposed, works because the correlation matrix | 121 # The above, although transposed, works because the correlation matrix |
113 # is symmetric | 122 # is symmetric |
114 xd=[stats.describe(x[i]) for i in range(100)] | 123 xd=[stats.describe(x[i]) for i in range(N)] |
115 xs=stats.describe(np.array([xd[i].mean for i in range(100)])) | 124 xs=stats.describe(np.array([xd[i].mean for i in range(N)])) |
116 xm=xs.mean | 125 xm=xs.mean |
117 xsd=np.sqrt(xs.variance) | 126 xsd=np.sqrt(xs.variance) |
118 | 127 |
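The remark that the slicing above, "although transposed, works because the correlation matrix is symmetric", can be checked on an invented 4x4 correlation matrix: slicing row i gives the same values as slicing column i, with the whole-crawl entry (index 0) and the segment's own entry (index i) dropped either way.

```python
import numpy as np

# Invented symmetric correlation matrix: variable 0 = whole crawl, 1-3 = segments
corr = np.array([[1.00, 0.95, 0.90, 0.85],
                 [0.95, 1.00, 0.88, 0.83],
                 [0.90, 0.88, 1.00, 0.80],
                 [0.85, 0.83, 0.80, 1.00]])

i = 2
row_slice = np.concatenate((corr[i][1:i], corr[i][i+1:]))   # the script's slicing
col_slice = np.concatenate((corr[1:i, i], corr[i+1:, i]))   # same values by column
print(row_slice)                                # [0.88 0.8 ]
print(np.array_equal(row_slice, col_slice))     # True
```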
119 x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)] | 128 x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)] |
120 | 129 |
121 ### I need to review rows, e.g. counts[0] is an array of 101 counts | 130 ### I need to review rows, e.g. counts[0] is an array of N+1 counts |
122 ### for the most common label in the complete crawl, | 131 ### for the most common label in the complete crawl, |
123 ### from the complete crawl and all the segments | 132 ### from the complete crawl and all the segments |
124 ### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts | 133 ### versus columns, e.g. counts[:,0] is an array of N decreasing counts |
125 ### for all the labels in the complete crawl | 134 ### for all the labels in the complete crawl |
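A toy version of the rows-versus-columns note above (invented numbers, a 3x4 matrix rather than the real counts array):

```python
import numpy as np

# 3 labels (rows) x 4 columns: column 0 = whole crawl, columns 1-3 = segments
toy_counts = np.array([[90, 30, 31, 29],
                       [60, 21, 19, 20],
                       [30, 11,  9, 10]])

print(toy_counts[0])      # row 0: the most common label's count in the crawl and each segment
print(toy_counts[:, 0])   # column 0: decreasing counts for every label in the whole crawl
```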