changeset 180:a20cf98f3a77

rename to avoid name clash with scipy.stats
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 26 Nov 2023 21:24:38 +0000
parents 16d603447fbc
children 0058e523f649
files lib/python/cc/ccstats.py lib/python/cc/stats.py
diffstat 2 files changed, 192 insertions(+), 191 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/ccstats.py	Sun Nov 26 21:24:38 2023 +0000
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+'''Rank correlation processing for a csv tabulation of counts by segment
+   First column is for the whole crawl, then 100 columns for segs 0-99
+   Each row is counts for some property, e.g. mime-detected or tld
+
+   For example, assuming ALL.tsv has the whole-crawl warc-only counts
+   and s...tsv have the segment counts, ALL with counts in column 1,
+
+   tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
+
+   will produce such a file with
+     * 100 rows, one for each of the top 100 counts
+     * 101 columns, 0 for ALL and 1--100 for segs 0--99
+
+   Usage: python3 -i ccstats.py name id
+     where name.csv has the input
+'''
+
+import numpy as np
+from numpy import loadtxt
+from scipy import stats
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+import pylab
+
+import sys, math
+
+class CCStats:
+  def __init__(self, file, id):
+    self.file = file
+    self.id = id
+    self.counts=loadtxt(file+".csv",delimiter=',')
+    self.N=self.counts.shape[1]-1
+    # "If axis=0 (default), then each column represents a variable, with
+    #        observations in the rows"
+    # So each column is a sequence of counts, for the whole crawl in column 0
+    #   and for segments 0--N-1 in columns 1--N
+    self.corr=stats.spearmanr(self.counts,nan_policy='omit').correlation
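+    # corr is the symmetric (N+1)x(N+1) matrix of pairwise Spearman rhos
+    #  over the columns of counts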
+
+    self.all=self.corr[0][1:]
+    self.all_s=stats.describe(self.all)
+    self.all_m=self.all_s.mean
+
+    self.x=np.array([np.concatenate((self.corr[i][1:i],
+                                self.corr[i][i+1:])) for i in range(1,self.N+1)])
+    # The above, although transposed, works because the correlation matrix
+    #  is symmetric
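+    # x[i-1] is segment i's correlations with the other N-1 segments,
+    #  i.e. row i of corr with column 0 (whole crawl) and column i (itself) dropped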
+    self.xd=[stats.describe(self.x[i]) for i in range(self.N)]
+    self.xs=stats.describe(np.array([self.xd[i].mean for i in range(self.N)]))
+    self.xm=self.xs.mean
+    self.xsd=np.sqrt(self.xs.variance)
+
+    self.x_ranks=[stats.rankdata(-self.counts[:,i],method='average') for i in range(1,self.N+1)]
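+    # x_ranks[i] ranks the rows (labels) of segment i by descending count,
+    #  for comparison against the baseline row order 1,2,... in first_diff below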
+
+    self.aa = self.ranks()
+    self.aa_by_all=self.aa[self.aa[:,1].argsort()]
+    self.iranks_by_all = [int(r) for r in self.aa_by_all[:,0]]
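+    # aa_by_all: the combined measures from ranks(), ordered by rank corr.
+    #  wrt the whole crawl; iranks_by_all gives the segment IDs in that order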
+
+  def qqa(self):
+    # q-q plot for the whole crawl
+    sm.qqplot(self.all, line='s')
+    plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%self.id)
+    plt.show()
+
+  def qqs(self):
+    # q-q plots for the best and worst (by variance) segments
+    global xv, xworst, xbest
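+    # globals, presumably so the values stay inspectable in the
+    #  interactive session this script is meant to run under (python3 -i)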
+    xv=[d.variance for d in self.xd]
+    xworst=xv.index(max(xv))
+    xbest=xv.index(min(xv))
+    print(xbest,xworst)
+    sm.qqplot(self.x[xbest], line='s')
+    plt.gca().set_title('Best segment (least variance): %s'%xbest)
+    plt.show()
+    sm.qqplot(self.x[xworst], line='s')
+    plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+    plt.show()
+
+  def plot_x(self,sort=False,block=True,all_only=True,title=None):
+    # Make these two subplots, w. and w/o sorting
+    # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+    #  for legend hacking
+    if sort:
+      aso=np.argsort(-self.all)
+      plot_all=self.all[aso]
+      plot_x=np.array([self.xd[i].mean for i in range(self.N)])[aso]
+    else:
+      plot_all=self.all
+      plot_x=[self.xd[i].mean for i in range(self.N)]
+    if title is None:
+      l1='Rank correlation of segment x whole crawl'
+      l2='Mean of segment x whole crawl'
+    else:
+      l1=l2=None
+    plt.plot(plot_all,'rx',label=l1)
+    plt.plot([0,self.N-1],[self.all_m,self.all_m],'r',label=l2)
+    if not(all_only):
+      plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+      plt.plot([0,self.N-1],[self.xm,self.xm],'b',label='Mean of segment x segment means')
+    if title is None:
+      plt.legend(loc='best',fontsize='small') # legend needs the labelled artists, so call it after plotting
+    plt.axis([0,self.N-1,0.85 if all_only else 0.8,1.0])
+    plt.grid(True)
+    if title is not None:
+      plt.title(title)
+    plt.show(block=block)
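+  # e.g. S.plot_x(sort=True,all_only=False) plots the per-segment correlations
+  #  in decreasing order together with the cross-segment means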
+
+  def hist_x(self,align='mid'):
+    self.hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)],
+         'Mean of rank correlation of each segment x all other segments',
+         align)
+
+  def hist_all(self,align='mid'):
+    self.hist(self.all_m,np.sqrt(self.all_s.variance),self.all,
+         'Rank correlation of each segment x whole crawl %s'%self.id,
+         align)
+
+  def plot_ci(self,rhos,n,trim=None,conf=0.95):
+     # rhos are (rank) correlation values
+     rhos_s=rhos[(-rhos).argsort()]
+     if trim is None:
+       l=len(rhos)
+     else:
+       rhos_s=rhos_s[:trim]
+       l=trim
+     cc=(np.array([ci(r,n,conf) for r in rhos_s])).T # ci is the module-level helper defined below
+     ue=cc[1]-rhos_s
+     le=rhos_s-cc[0]
+     #for i in range(len(rhos)):
+       #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+     plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+     plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(self.id,conf*100))
+     plt.show()
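+  # e.g. S.plot_ci(S.all,S.counts.shape[0]) plots each segment's correlation
+  #  with the whole crawl and its 95% confidence interval (n = number of rows)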
+
+  def ranks(self):
+    # Combine segment measures:
+    #  segID, rank corr. wrt all, inverse variance, mean cross rank corr., first disagreement
+    # convert to ranks, smallest value == highest rank
+    all_ranked=stats.rankdata(-self.all,method='average') # invert since
+                                                     #  large corr is good
+    x_variance_ranked=stats.rankdata([self.xd[i].variance for i in range(self.N)])
+                                                    # small corr variance is good
+    x_mean_ranked=stats.rankdata([-(self.xd[i].mean) for i in range(self.N)])
+                                                     # invert since
+                                                     #  large mean corr is good
+    fd_ranked=stats.rankdata([-first_diff(self.x_ranks[i]) for i in range(self.N)])
+                                                     # invert since
+                                                     #  large first diff is good
+    return np.array([[i,
+                      all_ranked[i],
+                      x_variance_ranked[i],
+                      x_mean_ranked[i],
+                      fd_ranked[i]] for i in range(self.N)])
+
+  def hist(self,m,sd,hh,title,align):
+    sdd=[(i,m+(i*sd)) for i in range(-2,3)] # label i at mean + i standard deviations
+    fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+    sdax=hax.twiny()
+    hax.hist(hh,color='lightblue',align=align)
+    hax.set_title(title)
+    for s,v in sdd:
+         sdax.plot([v,v],[0,18],'b')
+    sdax.set_xlim(hax.get_xlim())
+    sdax.set_ylim(hax.get_ylim())
+    sdax.set_xticks([v for s,v in sdd])
+    sdax.set_xticklabels([str(s) for s,v in sdd])
+    plt.show()
+
+def first_diff(ranks):
+  # first disagreement with baseline == {1,2,...}
+  for i in range(len(ranks)):
+    if ranks[i]!=i+1.0:
+      return i
+  return i+1
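+# e.g. first_diff([1.0,2.0,4.0,3.0]) == 2: the first position whose rank
+#  disagrees with the baseline 1,2,3,...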
+
+def ci(rho,n,conf=0.95):
+  # Courtesy of https://stats.stackexchange.com/a/18904
+  # rho is (rank) correlation, n is sample size
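+  # Fisher z-transform: atanh(rho) is approximately normal with standard
+  #  error 1/sqrt(n-3), so build the interval there and map back with tanh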
+  stderr=1.0/math.sqrt(n-3)
+  z=stats.norm.ppf(1.0-((1.0-conf)/2))
+  delta=z*stderr
+  lower=math.tanh(math.atanh(rho)-delta)
+  upper=math.tanh(math.atanh(rho)+delta)
+  return (lower,upper)
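+# e.g. ci(0.97,100) is approximately (0.956,0.980)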
+
+if __name__ == '__main__':
+  S = CCStats(*sys.argv[1:])
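+  # e.g. python3 -i ccstats.py all_100 CC-MAIN-2023-40
+  #  (illustrative arguments: name.csv produced as in the docstring, id a crawl label)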
+
+### I need to review rows, e.g. counts[0] is an array of N+1 counts
+###   for the most common label in the complete crawl,
+###   from the complete crawl and all the segments
+### versus columns, e.g. counts[:,0] is an array of N decreasing counts
+###   for all the labels in the complete crawl
--- a/lib/python/cc/stats.py	Fri Nov 24 20:41:03 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,191 +0,0 @@
-#!/usr/bin/env python3
-'''Rank self.correlation processing for a csv tabulation of self.counts by segment 
-   First column is for whole crawl, then 100 columns for segs 0-99
-   Each row is self.counts for some property, e.g. mime-detected or tld
-
-   For example, assuming ALL.tsv has the whole-crawl warc-only self.counts
-   and s...tsv have the segment self.counts, ALL with self.counts in column 1,
-
-   tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
-
-   will produce such a file with
-     * 100 rows, one for each of the top 100 self.counts
-     * 101 columns, 0 for ALL and 1--100 for segs 0--99
-
-   Usage: python3 -i spearman.py name id
-     where name.csv has the input
-'''
-     
-import numpy as np
-from numpy import loadtxt
-from scipy import stats
-import statsmodels.api as sm
-import matplotlib.pyplot as plt
-import pylab
-
-import sys, math
-
-class CCStats:
-  def __init__(self, file, id):
-    self.file = file
-    self.id = id
-    self.counts=loadtxt(file+".csv",delimiter=',')
-    self.N=self.counts.shape[1]-1
-    # "If axis=0 (default), then each column represents a variable, with
-    #        observations in the rows"
-    # So each column is a sequence of self.counts, for whole crawl in column 0
-    #   and for segments 0--self.N-1 in columns 1--self.N
-    self.corr=stats.spearmanr(self.counts,nan_policy='omit').self.correlation
-
-    self.all=self.corr[0][1:]
-    self.all_s=stats.describe(self.all)
-    self.all_m=self.all_s.mean
-
-    self.x=np.array([np.concatenate((self.corr[i][1:i],
-                                self.corr[i][i+1:])) for i in range(1,self.N+1)])
-    # The above, although transposed, works because the self.correlation matrix
-    #  is symmetric
-    self.xd=[stats.describe(self.x[i]) for i in range(self.N)]
-    self.xs=stats.describe(np.array([self.xd[i].mean for i in range(self.N)]))
-    self.xm=self.xs.mean
-    self.xsd=np.sqrt(self.xs.variance)
-
-    self.x_ranks=[stats.rankdata(-self.counts[:,i],method='average') for i in range(1,self.N+1)]
-
-    self.aa=ranks()
-    self.aa_by_all=self.aa[self.aa[:,1].argsort()]
-
-  def qqa(self):
-    # q-q plot for the whole crawl
-    sm.qqplot(self.all, line='s')
-    plt.gca().set_title('Rank self.correlation per segment wrt whole archive %s'%id)
-    plt.show()
-
-  def qqs(self):
-    # q-q plots for the best and worst (by variance) segments
-    global xv, xworst, xbest
-    xv=[d.variance for d in self.xd]
-    xworst=xv.index(max(xv))
-    xbest=xv.index(min(xv))
-    print(xbest,xworst)
-    sm.qqplot(self.x[xbest], line='s')
-    plt.gca().set_title('Best segment (least variance): %s'%xbest)
-    plt.show()
-    sm.qqplot(self.x[xworst], line='s')
-    plt.gca().set_title('Worst segment (most variance): %s'%xworst)
-    plt.show()
-
-  def plot_x(sort=False,block=True,all_only=True,title=None):
-    # Make these two subplots, w. and w/o sorting
-    # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
-    #  for legend hacking
-    if sort:
-      aso=np.argsort(-self.all)
-      plot_all=self.all[aso]
-      plot_x=np.array([self.xd[i].mean for i in range(self.N)])[aso]
-    else:
-      plot_all=self.all
-      plot_x=[self.xd[i].mean for i in range(self.N)]
-    if title is None:
-      l1='Rank self.correlation of segment self.x whole crawl'
-      l2='Mean of segment self.x whole crawl'
-      plt.legend(loc='best',fontsize='small')
-    else:
-      l1=l2=None
-    plt.plot(plot_all,'rx',label=l1)
-    plt.plot([0,self.N-1],[self.all_m,self.all_m],'r',label=l2)
-    if not(all_only):
-      plt.plot(plot_x,'bx',label='Mean of rank self.correlation of each segment self.x self.all other segments')
-      plt.plot([0,self.N-1],[self.xm,self.xm],'b',label='Mean of segment self.x segment means')
-    plt.axis([0,self.N-1,0.85 if all_only else 0.8,1.0])
-    plt.grid(True)
-    if title is not None:
-      plt.title(title)
-    plt.show(block=block)
-
-  def hist_x(align='mid'):
-    hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)],
-         'Mean of rank self.correlation of each segment self.x self.all other segments',
-         align)
-
-  def hist_all(align='mid'):
-    hist(self.all_m,np.sqrt(self.all_s.variance),self.all,
-         'Rank self.correlation of each segment self.x whole crawl %s'%id,
-         align)
-
-  def hist(m,sd,hh,title,align):
-    sdd=[(i,m-(i*sd)) for i in range(-2,3)]
-    fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
-    sdax=hax.twiny()
-    hax.hist(hh,color='lightblue',align=align)
-    hax.set_title(title)
-    for s,v in sdd:
-         sdax.plot([v,v],[0,18],'b')
-    sdax.set_xlim(hax.get_xlim())
-    sdax.set_ylim(hax.get_ylim())
-    sdax.set_xticks([v for s,v in sdd])
-    sdax.set_xticklabels([str(s) for s,v in sdd])
-    plt.show()
-
-  def ci(rho,n,conf=0.95):
-    # Courtesy of https://stats.stackexchange.com/a/18904
-    # rho is (rank) self.correlation, n is sample size
-    stderr=1.0/math.sqrt(n-3)
-    z=stats.norm.ppf(1.0-((1.0-conf)/2))
-    delta=z*stderr
-    lower=math.tanh(math.atanh(rho)-delta)
-    upper=math.tanh(math.atanh(rho)+delta)
-    return (lower,upper)
-
-  def plot_ci(rhos,n,trim=None,conf=0.95):
-     # rhos are (rank) self.correlation values
-     rhos_s=rhos[(-rhos).argsort()]
-     if trim is None:
-       l=len(rhos)
-     else:
-       rhos_s=rhos_s[:trim]
-       l=trim
-     cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
-     ue=cc[1]-rhos_s
-     le=rhos_s-cc[0]
-     #for i in range(len(rhos)):
-       #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
-     plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-     plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
-     plt.show()
-
-  def first_diff(ranks):
-    # first disagreement with baseline == {1,2,...}
-    for i in range(len(ranks)):
-      if ranks[i]!=i+1.0:
-        return i
-    return i+1
-
-  def ranks():
-    # Combine segment measures:
-    #  segID,rank self.corr. wrt self.all,inverse variance, mean cross rank self.corr.,first disagreement
-    # convert to ranks, smallest value == highest rank
-    all_ranked=stats.rankdata(-self.all,method='average') # invert since
-                                                     #  large self.corr is good
-    x_variance_ranked=stats.rankdata([self.xd[i].variance for i in range(self.N)])
-                                                    # small self.corr variance is good
-    x_mean_ranked=stats.rankdata([-(self.xd[i].mean) for i in range(self.N)])
-                                                     # invert since
-                                                     #  large mean self.corr is good
-    fd_ranked=stats.rankdata([-first_diff(self.x_ranks[i]) for i in range(self.N)])
-                                                     # invert since
-                                                     #  large first diff is good
-    return np.array([[i,
-                      all_ranked[i],
-                      x_variance_ranked[i],
-                      x_mean_ranked[i],
-                      fd_ranked[i]] for i in range(self.N)])
-
-if __name__ == '__main__':
-  S = CCStats(*sys.argv[1:])
-
-### I need to review rows, e.g. self.counts[0] is an array of self.N+1 self.counts
-###   for the most common label in the complete crawl,
-###   from the complete crawl and all the segments
-### versus columns, e.g. self.counts[:,0] is an array of self.N decreasing self.counts
-###   for all the labels in the complete crawl