# HG changeset patch # User Henry S. Thompson # Date 1668684427 0 # Node ID c73ec9deabbe40ccc0cf8592c91222ed0380c0e7 # Parent 669a0b120d34a5a332ef4836fd808d8d68821195 comments and more care about rows vs. columns diff -r 669a0b120d34 -r c73ec9deabbe bin/spearman.py --- a/bin/spearman.py Wed Nov 16 19:52:50 2022 +0000 +++ b/bin/spearman.py Thu Nov 17 11:27:07 2022 +0000 @@ -3,12 +3,14 @@ First column is for whole crawl, then 100 columns for segs 0-99 Each row is counts for some property, e.g. mime-detected or tld - For example + For example, assuming all.tsv has the whole-crawl warc-only counts + and s...tsv have the segment counts, all with counts in column 1, tr -d ',' all_100.csv - will produce such a file with 100 rows assuming all.tsv has the whole-crawl - warc-only counts and s...tsv have the segment counts, all counts in column 1 + will produce such a file with + * 100 rows, one for each of the top 100 counts + * 101 columns, 0 for all and 1--100 for segs 0--99 Usage: python3 -i spearman.py name where name.csv has the input @@ -82,7 +84,8 @@ counts=loadtxt(sys.argv[1]+".csv",delimiter=',') # "If axis=0 (default), then each column represents a variable, with # observations in the rows" -ranks=[stats.rankdata(-counts[i],method='average') for for i in range(1,100)] +# So each column is a sequence of counts, for whole crawl in column 0 +# and for segments 0--99 in columns 1--100 corr=stats.spearmanr(counts,nan_policy='omit').correlation all=corr[0][1:] @@ -91,11 +94,15 @@ x=np.array([np.concatenate((corr[i][1:i], corr[i][i+1:])) for i in range(1,101)]) +# The above, although transposed, works because the correlation matrix +# is symmetric xd=[stats.describe(x[i]) for i in range(100)] xs=stats.describe(np.array([xd[i].mean for i in range(100)])) xm=xs.mean xsd=np.sqrt(xs.variance) +ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,100)] + ### I need to review rows, e.g. 
counts[0] is an array of 101 counts ### for the most common label in the complete crawl, ### from the complete crawl and all the segments