changeset 156:adb1e22ad708

avoid global name conflict
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 24 Oct 2023 14:26:36 +0100
parents 56825fc8459d
children 463fc7b09119
files lib/python/cc/spearman.py
diffstat 1 files changed, 17 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/spearman.py	Wed Oct 11 12:51:06 2023 +0100
+++ b/lib/python/cc/spearman.py	Tue Oct 24 14:26:36 2023 +0100
@@ -3,14 +3,14 @@
    First column is for whole crawl, then 100 columns for segs 0-99
    Each row is counts for some property, e.g. mime-detected or tld
 
-   For example, assuming all.tsv has the whole-crawl warc-only counts
-   and s...tsv have the segment counts, all with counts in column 1,
+   For example, assuming ALL.tsv has the whole-crawl warc-only counts
+   and s...tsv have the segment counts, all with counts in column 1,
 
-   tr -d ',' <all.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+   tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
 
    will produce such a file with
      * 100 rows, one for each of the top 100 counts
-     * 101 columns, 0 for all and 1--100 for segs 0--99
+     * 101 columns, 0 for ALL and 1--100 for segs 0--99
 
    Usage: python3 -i spearman.py name id
      where name.csv has the input
@@ -27,7 +27,7 @@
 
 def qqa():
   # q-q plot for the whole crawl
-  sm.qqplot(all, line='s')
+  sm.qqplot(ALL, line='s')
   plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
   plt.show()
 
@@ -50,11 +50,11 @@
   # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
   #  for legend hacking
   if sort:
-    aso=np.argsort(-all)
-    plot_all=all[aso]
+    aso=np.argsort(-ALL)
+    plot_all=ALL[aso]
     plot_x=np.array([xd[i].mean for i in range(N)])[aso]
   else:
-    plot_all=all
+    plot_all=ALL
     plot_x=[xd[i].mean for i in range(N)]
   if title is None:
     l1='Rank correlation of segment x whole crawl'
@@ -65,7 +65,7 @@
   plt.plot(plot_all,'rx',label=l1)
   plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
   if not(all_only):
-    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
     plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
   plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
   plt.grid(True)
@@ -75,11 +75,11 @@
 
 def hist_x(align='mid'):
   hist(xm,xsd,[xd[i].mean for i in range(N)],
-       'Mean of rank correlation of each segment x all other segments',
+       'Mean of rank correlation of each segment x all other segments',
        align)
 
 def hist_all(align='mid'):
-  hist(all_m,np.sqrt(all_s.variance),all,
+  hist(all_m,np.sqrt(all_s.variance),ALL,
        'Rank correlation of each segment x whole crawl %s'%id,
        align)
 
@@ -133,9 +133,9 @@
 
 def ranks():
   # Combine segment measures:
-  #  segID,rank corr. wrt all,inverse variance, mean cross rank corr.,first disagreement
+  #  segID,rank corr. wrt ALL,inverse variance, mean cross rank corr.,first disagreement
   # convert to ranks, smallest value == highest rank
-  all_ranked=stats.rankdata(-all,method='average') # invert since
+  all_ranked=stats.rankdata(-ALL,method='average') # invert since
                                                    #  large corr is good
   x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
                                                   # small corr variance is good
@@ -152,7 +152,7 @@
                     fd_ranked[i]] for i in range(N)])
 
 def main():
-  global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
+  global counts, id, corr, ALL, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
   global aa, aa_by_all, N
   counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
   id=sys.argv[2]
@@ -163,8 +163,8 @@
   #   and for segments 0--N-1 in columns 1--N
   corr=stats.spearmanr(counts,nan_policy='omit').correlation
 
-  all=corr[0][1:]
-  all_s=stats.describe(all)
+  ALL=corr[0][1:]
+  all_s=stats.describe(ALL)
   all_m=all_s.mean
 
   x=np.array([np.concatenate((corr[i][1:i],
@@ -183,6 +183,6 @@
 
 ### I need to review rows, e.g. counts[0] is an array of N+1 counts
 ###   for the most common label in the complete crawl,
-###   from the complete crawl and all the segments
+###   from the complete crawl and all the segments
 ### versus columns, e.g. counts[:,0] is an array of N decreasing counts
 ###   for all the labels in the complete crawl