# HG changeset patch
# User Henry S. Thompson
# Date 1668628370 0
# Node ID 669a0b120d34a5a332ef4836fd808d8d68821195
# Parent 7ffb686ca0604b536baef52c3a4905ff02961a5e
start work on ranking, lose faith in getting row vs. column correct every time

diff -r 7ffb686ca060 -r 669a0b120d34 bin/spearman.py
--- a/bin/spearman.py	Wed Nov 16 17:29:55 2022 +0000
+++ b/bin/spearman.py	Wed Nov 16 19:52:50 2022 +0000
@@ -67,17 +67,37 @@
     sdax.set_xticklabels([str(s) for s,v in sdd])
     plt.show()
 
+def first_diff(ranks):
+    # first disagreement with baseline == {1,2,...}
+    for i in range(len(ranks)):
+        if ranks[i]!=i+1.0:
+            return i
+    return i+1
+
+def rank_summary(i):
+    # Combine segment i's measures (named to avoid clashing with the ranks list below):
+    # segID, rank corr. wrt all, inverse variance, mean cross rank corr., first disagreement
+    return np.array([i,all[i],1.0/xd[i].variance,xd[i].mean,first_diff(ranks[i])])
+
 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
-o=stats.spearmanr(counts,nan_policy='omit')
+# "If axis=0 (default), then each column represents a variable, with
+# observations in the rows"
+ranks=[stats.rankdata(-counts[i],method='average') for i in range(1,100)]
+corr=stats.spearmanr(counts,nan_policy='omit').correlation
 
-all=o.correlation[0][1:]
+all=corr[0][1:]
 all_s=stats.describe(all)
 all_m=all_s.mean
 
-# Should get the confidence interval for this, so we can
-# use it in plot_x
-x=np.array([np.concatenate((o.correlation[i][1:i],o.correlation[i][i+1:])) for i in range(1,101)])
+x=np.array([np.concatenate((corr[i][1:i],
+                            corr[i][i+1:])) for i in range(1,101)])
 xd=[stats.describe(x[i]) for i in range(100)]
 xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
 xm=xs.mean
 xsd=np.sqrt(xs.variance)
+
+### I need to review rows, e.g. counts[0] is an array of 101 counts
+### for the most common label in the complete crawl,
+### from the complete crawl and all the segments
+### versus columns, e.g. counts[:,0] is an array of 100 decreasing counts
+### for all the labels in the complete crawl
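
A sanity check on the axis convention quoted in the patch: with the default axis=0, scipy.stats.spearmanr treats each column as a variable and each row as an observation, so for a labels-by-sources counts matrix the result is a sources-by-sources correlation matrix. A minimal sketch, with a toy 4x3 array (made-up values) standing in for counts:

import numpy as np
from scipy import stats

# Toy stand-in for counts: 4 labels (rows) x 3 sources (columns),
# column 0 playing the role of the complete crawl.
toy = np.array([[40., 40., 39.],
                [30., 29., 31.],
                [20., 21., 20.],
                [10., 10., 10.]])

# axis=0 (default): columns are variables, rows are observations,
# so the result is a 3x3 source-by-source correlation matrix.
corr = stats.spearmanr(toy, nan_policy='omit').correlation
assert corr.shape == (3, 3)
print(corr[0][1:])  # source 0 (the "complete crawl") vs. each segment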
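
The row-versus-column trap described in the closing comment can be made concrete the same way: a row counts[0] holds one label's count in every source, while a column counts[:,0] holds one source's whole label distribution, and only the column ordering matches the {1,2,...} baseline. Again a sketch over invented toy values:

import numpy as np
from scipy import stats

# Same toy shape as above: rows = labels, columns = sources.
toy = np.array([[40., 40., 39.],
                [30., 29., 31.],
                [20., 21., 20.],
                [10., 10., 10.]])

print(toy[0])     # row: one label's count in each of the 3 sources
print(toy[:, 0])  # column: all 4 labels' counts in source 0, decreasing

# A baseline ranking {1,2,...} only matches a *column*, because the
# labels are ordered by their counts in the complete crawl:
print(stats.rankdata(-toy[:, 0], method='average'))  # [1. 2. 3. 4.]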
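
Finally, an illustration of first_diff's return convention on a hypothetical segment whose second- and third-ranked labels have swapped: it returns the first 0-based position whose rank disagrees with the baseline, or len(ranks) if there is no disagreement.

import numpy as np
from scipy import stats

def first_diff(ranks):
    # first disagreement with baseline == {1,2,...}
    for i in range(len(ranks)):
        if ranks[i] != i + 1.0:
            return i
    return i + 1

seg = np.array([50., 20., 30., 10.])        # one segment's counts, by label
r = stats.rankdata(-seg, method='average')  # [1. 3. 2. 4.]
print(first_diff(r))                        # 1: ranks 2 and 3 have swapped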