comparison bin/spearman.py @ 32:91741bf3ab51

add sort flag to plot_x
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 22 Nov 2022 11:02:51 +0000
parents e7c8e64c2fdd
children 317bf47b506c
--- a/bin/spearman.py (31:e7c8e64c2fdd)
+++ b/bin/spearman.py (32:91741bf3ab51)
@@ -43,25 +43,33 @@
     plt.show()
     sm.qqplot(x[xworst], line='s')
     plt.gca().set_title('Worst segment (most variance): %s'%xworst)
     plt.show()
 
-def plot_x(block=True):
-    plt.plot([xd[i].mean for i in range(100)],'bx',label='Mean of rank correlation of each segment x all other segments')
-    plt.plot([0,99],[xm,xm],'b',label='Mean of segment x segment means')
-    plt.plot(all,'rx',label='Rank correlation of segment x whole crawl')
-    plt.plot([0,99],[all_m,all_m],'r',label='Mean of segment x whole crawl')
-    plt.axis([0,99,0.8,1.0])
+def plot_x(sort=False,block=True):
+    # Make these two subplots...
+    if sort:
+        aso=np.argsort(-all)
+        plot_all=all[aso]
+        plot_x=np.array([xd[i].mean for i in range(N)])[aso]
+    else:
+        plot_all=all
+        plot_x=[xd[i].mean for i in range(N)]
+    plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl')
+    plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl')
+    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
+    plt.axis([0,N-1,0.8,1.0])
     plt.legend(loc='best')
     plt.grid(True)
     plt.show(block=block)
 
 def hist():
     sdd=[(i,xm-(i*xsd)) for i in range(-2,3)]
     fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
     sdax=hax.twiny()
-    hax.hist([xd[i].mean for i in range(100)],color='lightblue')
+    hax.hist([xd[i].mean for i in range(N)],color='lightblue')
     hax.set_title('Mean of rank correlation of each segment x all other segments')
     for s,v in sdd:
         sdax.plot([v,v],[0,18],'b')
     sdax.set_xlim(hax.get_xlim())
     sdax.set_ylim(hax.get_ylim())
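[Note, not part of the changeset: the new sort branch hinges on computing one descending argsort over all and applying the same permutation to both plotted series, so the red and blue points stay paired. A minimal self-contained sketch of that idea, using synthetic stand-ins for the script's all/xd/N globals:]

    import numpy as np

    # Synthetic stand-ins for the script's globals (illustration only).
    N = 10
    rng = np.random.default_rng(0)
    seg_vs_crawl = rng.uniform(0.8, 1.0, N)   # plays the role of `all`
    seg_vs_segs = rng.uniform(0.8, 1.0, N)    # plays the role of [xd[i].mean for i in range(N)]

    # Descending order by segment-vs-whole-crawl correlation; applying the
    # same permutation to the companion series keeps the pairs aligned,
    # as plot_x(sort=True) does.
    aso = np.argsort(-seg_vs_crawl)
    plot_all = seg_vs_crawl[aso]
    plot_means = seg_vs_segs[aso]
    assert (np.diff(plot_all) <= 0).all()     # now monotonically non-increasing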
@@ -80,46 +88,47 @@
     # Combine segment measures:
     # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
     # convert to ranks, smallest value == highest rank
     all_ranked=stats.rankdata(-all,method='average') # invert since
                                                      # large corr is good
-    x_variance_ranked=stats.rankdata([xd[i].variance for i in range(100)])
+    x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
                                                      # small corr variance is good
-    x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(100)])
+    x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
                                                      # invert since
                                                      # large mean corr is good
-    fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(100)])
+    fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
                                                      # invert since
                                                      # large first diff is good
     return np.array([[i,
                       all_ranked[i],
                       x_variance_ranked[i],
                       x_mean_ranked[i],
-                      fd_ranked[i]] for i in range(100)])
+                      fd_ranked[i]] for i in range(N)])
 
 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+N=counts.shape[0]
 # "If axis=0 (default), then each column represents a variable, with
 # observations in the rows"
 # So each column is a sequence of counts, for whole crawl in column 0
-# and for segments 0--99 in columns 1--100
+# and for segments 0--N-1 in columns 1--N
 corr=stats.spearmanr(counts,nan_policy='omit').correlation
 
 all=corr[0][1:]
 all_s=stats.describe(all)
 all_m=all_s.mean
 
 x=np.array([np.concatenate((corr[i][1:i],
-                            corr[i][i+1:])) for i in range(1,101)])
+                            corr[i][i+1:])) for i in range(1,N+1)])
 # The above, although transposed, works because the correlation matrix
 # is symmetric
-xd=[stats.describe(x[i]) for i in range(100)]
-xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
+xd=[stats.describe(x[i]) for i in range(N)]
+xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
 xm=xs.mean
 xsd=np.sqrt(xs.variance)
 
-x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)]
+x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
 
-### I need to review rows, e.g. counts[0] is an array of 101 counts
+### I need to review rows, e.g. counts[0] is an array of N+1 counts
 ### for the most common label in the complete crawl,
 ### from the complete crawl and all the segments
-### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts
+### versus columns, e.g. counts[:,0] is an array of N decreasing counts
 ### for all the labels in the complete crawl
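[Note, not part of the changeset: the column-as-variable convention of scipy.stats.spearmanr, and the symmetry argument the comment above relies on, can be checked on a toy counts matrix. Illustrative sketch only; the numbers are made up:]

    import numpy as np
    from scipy import stats

    # Toy counts: 6 labels (rows) x 4 variables (columns): whole crawl + 3 segments.
    counts = np.array([[100, 40, 30, 30],
                       [ 80, 25, 30, 25],
                       [ 60, 20, 15, 25],
                       [ 40, 15, 10, 15],
                       [ 20,  5, 10,  5],
                       [ 10,  5,  3,  2]], dtype=float)

    corr = stats.spearmanr(counts, nan_policy='omit').correlation
    assert corr.shape == (4, 4)     # one row/column per counts column (variable)

    seg_vs_crawl = corr[0][1:]      # each segment's rank correlation with the whole crawl

    # Because the matrix is symmetric, row i without its diagonal entry equals
    # column i without its diagonal entry; this is why reading rows in
    # np.concatenate((corr[i][1:i], corr[i][i+1:])) works (the script's 1:i
    # slice additionally drops column 0, the whole crawl).
    i = 2
    row_wo_diag = np.concatenate((corr[i][:i], corr[i][i+1:]))
    col_wo_diag = np.concatenate((corr[:i, i], corr[i+1:, i]))
    assert np.allclose(row_wo_diag, col_wo_diag)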
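[Note, not part of the changeset: the rank-combination step above converts every measure to ranks in which rank 1 is best, negating measures where larger is better, as the inline comments say. A small sketch of that convention with made-up per-segment scores:]

    import numpy as np
    from scipy import stats

    corr_with_crawl = np.array([0.95, 0.99, 0.90, 0.97])      # larger is better
    variances       = np.array([0.004, 0.001, 0.009, 0.002])  # smaller is better

    # rankdata gives rank 1 to the smallest value, so negate "larger is
    # better" measures before ranking; rank 1 then marks the best segment.
    corr_ranked = stats.rankdata(-corr_with_crawl, method='average')
    var_ranked  = stats.rankdata(variances, method='average')

    print(corr_ranked)   # [3. 1. 4. 2.] -> segment 1 has the highest correlation
    print(var_ranked)    # [3. 1. 4. 2.] -> segment 1 also has the lowest variance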