changeset 179:16d603447fbc

move to class with instance vars instead of many globals
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 24 Nov 2023 20:41:03 +0000
parents c42a5f7c97c5
children a20cf98f3a77
files lib/python/cc/stats.py
diffstat 1 files changed, 148 insertions(+), 149 deletions(-)
--- a/lib/python/cc/stats.py	Fri Nov 24 20:40:09 2023 +0000
+++ b/lib/python/cc/stats.py	Fri Nov 24 20:41:03 2023 +0000
@@ -1,15 +1,15 @@
 #!/usr/bin/env python3
 '''Rank correlation processing for a csv tabulation of counts by segment
    First column is for whole crawl, then 100 columns for segs 0-99
    Each row is counts for some property, e.g. mime-detected or tld
 
    For example, assuming ALL.tsv has the whole-crawl warc-only counts
    and s...tsv have the segment counts, ALL with counts in column 1,
 
    tr -d ',' <ALL.tsv |head -100 | while read n m; do printf "%s%s\n" $n $(for i in {0..99}; do printf ",%s" $({ grep -w "w    $m\$" s${i}.tsv || echo NaN ;} | cut -f 1 ) ; done ) ; done > all_100.csv
 
    will produce such a file with
      * 100 rows, one for each of the top 100 counts
      * 101 columns, 0 for ALL and 1--100 for segs 0--99
 
    Usage: python3 -i spearman.py name id
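
For readers who would rather not decode the shell one-liner in the docstring, here is a rough Python equivalent — an untested sketch, not part of this repository. It assumes ALL.tsv rows are "count<TAB>label" and each s{i}.tsv row is "count<TAB>w<TAB>label" (layout inferred from the grep -w and cut -f 1 calls); build_all_100 is a hypothetical helper name.

    import csv
    from itertools import islice

    def build_all_100(n_segs=100, top=100, out='all_100.csv'):
        # Per-segment lookup tables: label -> count (kept as strings so a
        # missing label can fall back to the literal 'NaN', as in the shell)
        seg = []
        for i in range(n_segs):
            d = {}
            with open('s%d.tsv' % i) as f:
                for line in f:
                    fields = line.rstrip('\n').split('\t')
                    if len(fields) >= 3 and fields[1] == 'w':
                        d[fields[2]] = fields[0]
            seg.append(d)
        # Top rows of ALL.tsv, commas stripped from counts as with tr -d ','
        with open('ALL.tsv') as f, open(out, 'w', newline='') as o:
            w = csv.writer(o)
            for line in islice(f, top):
                n, m = line.replace(',', '').split(None, 1)
                m = m.strip()
                w.writerow([n] + [seg[i].get(m, 'NaN') for i in range(n_segs)])
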
@@ -25,168 +25,167 @@
 
 import sys, math
 
-def qqa():
-  # q-q plot for the whole crawl
-  sm.qqplot(ALL, line='s')
-  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
-  plt.show()
+class CCStats:
+  def __init__(self, file, id):
+    self.file = file
+    self.id = id
+    self.counts=loadtxt(file+".csv",delimiter=',')
+    self.N=self.counts.shape[1]-1
+    # "If axis=0 (default), then each column represents a variable, with
+    #        observations in the rows"
+    # So each column is a sequence of counts, for whole crawl in column 0
+    #   and for segments 0--N-1 in columns 1--N
+    self.corr=stats.spearmanr(self.counts,nan_policy='omit').correlation
+
+    self.all=self.corr[0][1:]
+    self.all_s=stats.describe(self.all)
+    self.all_m=self.all_s.mean
 
-def qqs():
-  # q-q plots for the best and worst (by variance) segments
-  global xv, xworst, xbest
-  xv=[d.variance for d in xd]
-  xworst=xv.index(max(xv))
-  xbest=xv.index(min(xv))
-  print(xbest,xworst)
-  sm.qqplot(x[xbest], line='s')
-  plt.gca().set_title('Best segment (least variance): %s'%xbest)
-  plt.show()
-  sm.qqplot(x[xworst], line='s')
-  plt.gca().set_title('Worst segment (most variance): %s'%xworst)
-  plt.show()
+    self.x=np.array([np.concatenate((self.corr[i][1:i],
+                                self.corr[i][i+1:])) for i in range(1,self.N+1)])
+    # The above, although transposed, works because the correlation matrix
+    #  is symmetric
+    self.xd=[stats.describe(self.x[i]) for i in range(self.N)]
+    self.xs=stats.describe(np.array([self.xd[i].mean for i in range(self.N)]))
+    self.xm=self.xs.mean
+    self.xsd=np.sqrt(self.xs.variance)
+
+    self.x_ranks=[stats.rankdata(-self.counts[:,i],method='average') for i in range(1,self.N+1)]
+
+    self.aa=self.ranks()
+    self.aa_by_all=self.aa[self.aa[:,1].argsort()]
+
+  def qqa(self):
+    # q-q plot for the whole crawl
+    sm.qqplot(self.all, line='s')
+    plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%self.id)
+    plt.show()
 
-def plot_x(sort=False,block=True,all_only=True,title=None):
-  # Make these two subplots, w. and w/o sorting
-  # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
-  #  for legend hacking
-  if sort:
-    aso=np.argsort(-ALL)
-    plot_all=ALL[aso]
-    plot_x=np.array([xd[i].mean for i in range(N)])[aso]
-  else:
-    plot_all=ALL
-    plot_x=[xd[i].mean for i in range(N)]
-  if title is None:
-    l1='Rank correlation of segment x whole crawl'
-    l2='Mean of segment x whole crawl'
-    plt.legend(loc='best',fontsize='small')
-  else:
-    l1=l2=None
-  plt.plot(plot_all,'rx',label=l1)
-  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
-  if not(all_only):
-    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x ALL other segments')
-    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
-  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
-  plt.grid(True)
-  if title is not None:
-    plt.title(title)
-  plt.show(block=block)
+  def qqs(self):
+    # q-q plots for the best and worst (by variance) segments
+    global xv, xworst, xbest
+    xv=[d.variance for d in self.xd]
+    xworst=xv.index(max(xv))
+    xbest=xv.index(min(xv))
+    print(xbest,xworst)
+    sm.qqplot(self.x[xbest], line='s')
+    plt.gca().set_title('Best segment (least variance): %s'%xbest)
+    plt.show()
+    sm.qqplot(self.x[xworst], line='s')
+    plt.gca().set_title('Worst segment (most variance): %s'%xworst)
+    plt.show()
 
-def hist_x(align='mid'):
-  hist(xm,xsd,[xd[i].mean for i in range(N)],
-       'Mean of rank correlation of each segment x ALL other segments',
-       align)
-
-def hist_all(align='mid'):
-  hist(all_m,np.sqrt(all_s.variance),ALL,
-       'Rank correlation of each segment x whole crawl %s'%id,
-       align)
-
-def hist(m,sd,hh,title,align):
-  sdd=[(i,m-(i*sd)) for i in range(-2,3)]
-  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
-  sdax=hax.twiny()
-  hax.hist(hh,color='lightblue',align=align)
-  hax.set_title(title)
-  for s,v in sdd:
-       sdax.plot([v,v],[0,18],'b')
-  sdax.set_xlim(hax.get_xlim())
-  sdax.set_ylim(hax.get_ylim())
-  sdax.set_xticks([v for s,v in sdd])
-  sdax.set_xticklabels([str(s) for s,v in sdd])
-  plt.show()
+  def plot_x(self,sort=False,block=True,all_only=True,title=None):
+    # Make these two subplots, w. and w/o sorting
+    # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
+    #  for legend hacking
+    if sort:
+      aso=np.argsort(-self.all)
+      plot_all=self.all[aso]
+      plot_x=np.array([self.xd[i].mean for i in range(self.N)])[aso]
+    else:
+      plot_all=self.all
+      plot_x=[self.xd[i].mean for i in range(self.N)]
+    if title is None:
+      l1='Rank correlation of segment x whole crawl'
+      l2='Mean of segment x whole crawl'
+      plt.legend(loc='best',fontsize='small')
+    else:
+      l1=l2=None
+    plt.plot(plot_all,'rx',label=l1)
+    plt.plot([0,self.N-1],[self.all_m,self.all_m],'r',label=l2)
+    if not(all_only):
+      plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+      plt.plot([0,self.N-1],[self.xm,self.xm],'b',label='Mean of segment x segment means')
+    plt.axis([0,self.N-1,0.85 if all_only else 0.8,1.0])
+    plt.grid(True)
+    if title is not None:
+      plt.title(title)
+    plt.show(block=block)
 
-def ci(rho,n,conf=0.95):
-  # Courtesy of https://stats.stackexchange.com/a/18904
-  # rho is (rank) correlation, n is sample size
-  stderr=1.0/math.sqrt(n-3)
-  z=stats.norm.ppf(1.0-((1.0-conf)/2))
-  delta=z*stderr
-  lower=math.tanh(math.atanh(rho)-delta)
-  upper=math.tanh(math.atanh(rho)+delta)
-  return (lower,upper)
+  def hist_x(self,align='mid'):
+    self.hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)],
+         'Mean of rank correlation of each segment x all other segments',
+         align)
+
+  def hist_all(self,align='mid'):
+    self.hist(self.all_m,np.sqrt(self.all_s.variance),self.all,
+         'Rank correlation of each segment x whole crawl %s'%self.id,
+         align)
 
-def plot_ci(rhos,n,trim=None,conf=0.95):
-   # rhos are (rank) correlation values
-   rhos_s=rhos[(-rhos).argsort()]
-   if trim is None:
-     l=len(rhos)
-   else:
-     rhos_s=rhos_s[:trim]
-     l=trim
-   cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
-   ue=cc[1]-rhos_s
-   le=rhos_s-cc[0]
-   #for i in range(len(rhos)):
-     #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
-   plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-   plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
-   plt.show()
+  def hist(self,m,sd,hh,title,align):
+    sdd=[(i,m-(i*sd)) for i in range(-2,3)]
+    fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+    sdax=hax.twiny()
+    hax.hist(hh,color='lightblue',align=align)
+    hax.set_title(title)
+    for s,v in sdd:
+         sdax.plot([v,v],[0,18],'b')
+    sdax.set_xlim(hax.get_xlim())
+    sdax.set_ylim(hax.get_ylim())
+    sdax.set_xticks([v for s,v in sdd])
+    sdax.set_xticklabels([str(s) for s,v in sdd])
+    plt.show()
 
-def first_diff(ranks):
-  # first disagreement with baseline == {1,2,...}
-  for i in range(len(ranks)):
-    if ranks[i]!=i+1.0:
-      return i
-  return i+1
+  def ci(self,rho,n,conf=0.95):
+    # Courtesy of https://stats.stackexchange.com/a/18904
+    # rho is (rank) correlation, n is sample size
+    stderr=1.0/math.sqrt(n-3)
+    z=stats.norm.ppf(1.0-((1.0-conf)/2))
+    delta=z*stderr
+    lower=math.tanh(math.atanh(rho)-delta)
+    upper=math.tanh(math.atanh(rho)+delta)
+    return (lower,upper)
 
-def ranks():
-  # Combine segment measures:
-  #  segID,rank corr. wrt ALL,inverse variance, mean cross rank corr.,first disagreement
-  # convert to ranks, smallest value == highest rank
-  all_ranked=stats.rankdata(-ALL,method='average') # invert since
-                                                   #  large corr is good
-  x_variance_ranked=stats.rankdata([xd[i].variance for i in range(N)])
-                                                  # small corr variance is good
-  x_mean_ranked=stats.rankdata([-(xd[i].mean) for i in range(N)])
-                                                   # invert since
-                                                   #  large mean corr is good
-  fd_ranked=stats.rankdata([-first_diff(x_ranks[i]) for i in range(N)])
-                                                   # invert since
-                                                   #  large first diff is good
-  return np.array([[i,
-                    all_ranked[i],
-                    x_variance_ranked[i],
-                    x_mean_ranked[i],
-                    fd_ranked[i]] for i in range(N)])
+  def plot_ci(self,rhos,n,trim=None,conf=0.95):
+     # rhos are (rank) correlation values
+     rhos_s=rhos[(-rhos).argsort()]
+     if trim is None:
+       l=len(rhos)
+     else:
+       rhos_s=rhos_s[:trim]
+       l=trim
+     cc=(np.array([self.ci(r,n,conf) for r in rhos_s])).T
+     ue=cc[1]-rhos_s
+     le=rhos_s-cc[0]
+     #for i in range(len(rhos)):
+       #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
+     plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+     plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
+     plt.show()
 
-def main(file,id):
-  # this should be a class with all these globals as instance vars
-  global counts, corr, ALL, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
-  global aa, aa_by_all, N
-  counts=loadtxt(file+".csv",delimiter=',')
-  N=counts.shape[1]-1
-  # "If axis=0 (default), then each column represents a variable, with
-  #        observations in the rows"
-  # So each column is a sequence of counts, for whole crawl in column 0
-  #   and for segments 0--N-1 in columns 1--N
-  corr=stats.spearmanr(counts,nan_policy='omit').correlation
+  def first_diff(self,ranks):
+    # first disagreement with baseline == {1,2,...}
+    for i in range(len(ranks)):
+      if ranks[i]!=i+1.0:
+        return i
+    return i+1
 
-  ALL=corr[0][1:]
-  all_s=stats.describe(ALL)
-  all_m=all_s.mean
-
-  x=np.array([np.concatenate((corr[i][1:i],
-                              corr[i][i+1:])) for i in range(1,N+1)])
-  # The above, although transposed, works because the correlation matrix
-  #  is symmetric
-  xd=[stats.describe(x[i]) for i in range(N)]
-  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
-  xm=xs.mean
-  xsd=np.sqrt(xs.variance)
-
-  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
-
-  aa=ranks()
-  aa_by_all=aa[aa[:,1].argsort()]
+  def ranks(self):
+    # Combine segment measures:
+    #  segID, rank corr. wrt ALL, inverse variance, mean cross-rank corr., first disagreement
+    # convert to ranks, smallest value == highest rank
+    all_ranked=stats.rankdata(-self.all,method='average') # invert since
+                                                     #  large corr is good
+    x_variance_ranked=stats.rankdata([self.xd[i].variance for i in range(self.N)])
+                                                    # small corr variance is good
+    x_mean_ranked=stats.rankdata([-(self.xd[i].mean) for i in range(self.N)])
+                                                     # invert since
+                                                     #  large mean corr is good
+    fd_ranked=stats.rankdata([-self.first_diff(self.x_ranks[i]) for i in range(self.N)])
+                                                     # invert since
+                                                     #  large first diff is good
+    return np.array([[i,
+                      all_ranked[i],
+                      x_variance_ranked[i],
+                      x_mean_ranked[i],
+                      fd_ranked[i]] for i in range(self.N)])
 
 if __name__ == '__main__':
-  sys.exit(main(*sys.argv[1:]))
-
+  S = CCStats(*sys.argv[1:])
 
 ### I need to review rows, e.g. counts[0] is an array of N+1 counts
 ###   for the most common label in the complete crawl,
-###   from the complete crawl and ALL the segments
+###   from the complete crawl and all the segments
 ### versus columns, e.g. counts[:,0] is an array of N decreasing counts
 ###   for all the labels in the complete crawl
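
After this change the module is driven interactively: running it under python3 -i leaves a CCStats instance S at the prompt. A session sketch follows; the crawl id and the choice of n for plot_ci are assumptions, not from this changeset. Since ci/plot_ci use the Fisher transformation, tanh(atanh(rho) +/- z/sqrt(n-3)), n should be the sample size behind each rho, i.e. the number of rows in the csv.

    # $ python3 -i stats.py all_100 CC-MAIN-2023-40   # hypothetical crawl id
    # ...then at the interactive prompt:
    S.qqa()                              # q-q plot of segment-vs-ALL correlations
    S.plot_x(sort=True)                  # per-segment correlations, sorted
    S.hist_all()                         # histogram with stddev ticks on a twin axis
    S.plot_ci(S.all, S.counts.shape[0])  # error bars from Fisher-transform CIs
    S.aa_by_all[:5]                      # five best segments by rank wrt ALL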