Mercurial > hg > cc > cirrus_work
changeset 156:adb1e22ad708
avoid global name conflict
author   | Henry S. Thompson <ht@inf.ed.ac.uk>
date     | Tue, 24 Oct 2023 14:26:36 +0100
parents  | 56825fc8459d
children | 463fc7b09119
files    | lib/python/cc/spearman.py
diffstat | 1 files changed, 17 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/spearman.py Wed Oct 11 12:51:06 2023 +0100 +++ b/lib/python/cc/spearman.py Tue Oct 24 14:26:36 2023 +0100 @@ -3,14 +3,14 @@ First column is for whole crawl, then 100 columns for segs 0-99 Each row is counts for some property, e.g. mime-detected or tld - For example, assuming all.tsv has the whole-crawl warc-only counts - and s...tsv have the segment counts, all with counts in column 1, + For example, assuming ALL.tsv has the whole-crawl warc-only counts + and s...tsv have the segment counts, ALL with counts in column 1, - tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv + tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv will produce such a file with * 100 rows, one for each of the top 100 counts - * 101 columns, 0 for all and 1--100 for segs 0--99 + * 101 columns, 0 for ALL and 1--100 for segs 0--99 Usage: python3 -i spearman.py name id where name.csv has the input @@ -27,7 +27,7 @@ def qqa(): # q-q plot for the whole crawl - sm.qqplot(all, line='s') + sm.qqplot(ALL, line='s') plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id) plt.show() @@ -50,11 +50,11 @@ # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot # for legend hacking if sort: - aso=np.argsort(-all) - plot_all=all[aso] + aso=np.argsort(-ALL) + plot_all=ALL[aso] plot_x=np.array([xd[i].mean for i in range(N)])[aso] else: - plot_all=all + plot_all=ALL plot_x=[xd[i].mean for i in range(N)] if title is None: l1='Rank correlation of segment x whole crawl' @@ -65,7 +65,7 @@ plt.plot(plot_all,'rx',label=l1) plt.plot([0,N-1],[all_m,all_m],'r',label=l2) if not(all_only): - plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all 
other segments') + plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x ALL other segments') plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means') plt.axis([0,N-1,0.85 if all_only else 0.8,1.0]) plt.grid(True) @@ -75,11 +75,11 @@ def hist_x(align='mid'): hist(xm,xsd,[xd[i].mean for i in range(N)], - 'Mean of rank correlation of each segment x all other segments', + 'Mean of rank correlation of each segment x ALL other segments', align) def hist_all(align='mid'): - hist(all_m,np.sqrt(all_s.variance),all, + hist(all_m,np.sqrt(all_s.variance),ALL, 'Rank correlation of each segment x whole crawl %s'%id, align) @@ -133,9 +133,9 @@ def ranks(): # Combine segment measures: - # segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement + # segID,rank corr. wrt ALL,inverse variance, mean cross rank corr.,first disagreement # convert to ranks, smallest value == highest rank - all_ranked=stats.rankdata(-all,method='average') # invert since + all_ranked=stats.rankdata(-ALL,method='average') # invert since # large corr is good x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)]) # small corr variance is good @@ -152,7 +152,7 @@ fd_ranked[i]] for i in range(N)]) def main(): - global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr + global counts, id, corr, ALL, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr global aa, aa_by_all, N counts=loadtxt(sys.argv[1]+".csv",delimiter=',') id=sys.argv[2] @@ -163,8 +163,8 @@ # and for segments 0--N-1 in columns 1--N corr=stats.spearmanr(counts,nan_policy='omit').correlation - all=corr[0][1:] - all_s=stats.describe(all) + ALL=corr[0][1:] + all_s=stats.describe(ALL) all_m=all_s.mean x=np.array([np.concatenate((corr[i][1:i], @@ -183,6 +183,6 @@ ### I need to review rows, e.g. 
counts[0] is an array of N+1 counts ### for the most common label in the complete crawl, -### from the complete crawl and all the segments +### from the complete crawl and ALL the segments ### versus columns, e.g. counts[:,0] is an array of N decreasing counts ### for all the labels in the complete crawl