changeset 30:c73ec9deabbe

comments and more care about rows vs. columns
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 17 Nov 2022 11:27:07 +0000
parents 669a0b120d34
children e7c8e64c2fdd
files bin/spearman.py
diffstat 1 files changed, 11 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/bin/spearman.py	Wed Nov 16 19:52:50 2022 +0000
+++ b/bin/spearman.py	Thu Nov 17 11:27:07 2022 +0000
@@ -3,12 +3,14 @@
    First column is for whole crawl, then 100 columns for segs 0-99
    Each row is counts for some property, e.g. mime-detected or tld
 
-   For example
+   For example, assuming all.tsv has the whole-crawl warc-only counts
+   and s...tsv have the segment counts, all with counts in column 1,
 
    tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
 
-   will produce such a file with 100 rows assuming all.tsv has the whole-crawl
-   warc-only counts and s...tsv have the segment counts, all counts in column 1
+   will produce such a file with
+     * 100 rows, one for each of the top 100 counts
+     * 101 columns, 0 for all and 1--100 for segs 0--99
 
    Usage: python3 -i spearman.py name
      where name.csv has the input
@@ -82,7 +84,8 @@
 counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
 # "If axis=0 (default), then each column represents a variable, with
 #        observations in the rows"
-ranks=[stats.rankdata(-counts[i],method='average') for for i in range(1,100)]
+# So each column is a sequence of counts, for whole crawl in column 0
+#   and for segments 0--99 in columns 1--100
 corr=stats.spearmanr(counts,nan_policy='omit').correlation
 
 all=corr[0][1:]
@@ -91,11 +94,15 @@
 
 x=np.array([np.concatenate((corr[i][1:i],
                             corr[i][i+1:])) for i in range(1,101)])
+# The above, although transposed, works because the correlation matrix
+#  is symmetric
 xd=[stats.describe(x[i]) for i in range(100)]
 xs=stats.describe(np.array([xd[i].mean for i in range(100)]))
 xm=xs.mean
 xsd=np.sqrt(xs.variance)
 
+ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,101)]
+
 ### I need to review rows, e.g. counts[0] is an array of 101 counts
 ###   for the most common label in the complete crawl,
 ###   from the complete crawl and all the segments