changeset 181:0058e523f649

add explore_deltas and predict analysis fns
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 27 Nov 2023 18:25:39 +0000
parents a20cf98f3a77
children 45120df952be
files lib/python/cc/ccstats.py
diffstat 1 files changed, 102 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/ccstats.py	Sun Nov 26 21:24:38 2023 +0000
+++ b/lib/python/cc/ccstats.py	Mon Nov 27 18:25:39 2023 +0000
@@ -23,13 +23,19 @@
 import matplotlib.pyplot as plt
 import pylab
 
-import sys, math
+import sys, math, os.path, random
 
+SS = {}
 class CCStats:
-  def __init__(self, file, id):
-    self.file = file
-    self.id = id
-    self.counts=loadtxt(file+".csv",delimiter=',')
+  def __init__(self, year_month, name, file_base):
+    global SS
+    self.year_month = year_month
+    self.name = name
+    self.id = "%s %s"%(year_month,name)
+    SS[self.id] = self
+    self.file = os.path.expanduser("~/results/CC-MAIN-%s/%s.csv")%(year_month,
+                                                                   file_base)
+    self.counts=loadtxt(self.file,delimiter=',')
     self.N=self.counts.shape[1]-1
     # "If axis=0 (default), then each column represents a variable, with
     #        observations in the rows"
@@ -40,6 +46,7 @@
     self.all=self.corr[0][1:]
     self.all_s=stats.describe(self.all)
     self.all_m=self.all_s.mean
+    self.all_sd=math.sqrt(self.all_s.variance) # DescribeResult is a namedtuple; new attrs can't be set on it
 
     self.x=np.array([np.concatenate((self.corr[i][1:i],
                                 self.corr[i][i+1:])) for i in range(1,self.N+1)])
@@ -59,7 +66,7 @@
   def qqa(self):
     # q-q plot for the whole crawl
     sm.qqplot(self.all, line='s')
-    plt.gca().set_title('Rank self.correlation per segment wrt whole archive %s'%id)
+    plt.gca().set_title('Rank self.correlation per segment wrt whole archive %s'%self.id)
     plt.show()
 
   def qqs(self):
@@ -105,30 +112,34 @@
     plt.show(block=block)
 
   def hist_x(self,align='mid'):
-    self.hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)],
+    hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)],
          'Mean of rank self.correlation of each segment self.x self.all other segments',
          align)
 
   def hist_all(self,align='mid'):
-    self.hist(self.all_m,np.sqrt(self.all_s.variance),self.all,
-         'Rank self.correlation of each segment self.x whole crawl %s'%id,
+    hist(self.all_m,np.sqrt(self.all_s.variance),self.all,
+         'Rank self.correlation of each segment self.x whole crawl %s'%self.id,
          align)
 
-  def plot_ci(self,rhos,n,trim=None,conf=0.95):
+  def plot_ci(self,rhos=None,n=None,trim=None,conf=0.95):
      # rhos are (rank) correlation values
+     if rhos is None:
+       rhos = self.all
+     if n is None:
+       n = rhos.shape[0]
      rhos_s=rhos[(-rhos).argsort()]
      if trim is None:
        l=len(rhos)
      else:
        rhos_s=rhos_s[:trim]
        l=trim
-     cc=(np.array([self.ci(r,n,conf) for r in rhos_s])).T
+     cc=(np.array([ci(r,n,conf) for r in rhos_s])).T
      ue=cc[1]-rhos_s
      le=rhos_s-cc[0]
      #for i in range(len(rhos)):
        #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
      plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
-     plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
+     plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(self.id,conf*100))
      plt.show()
 
   def ranks(self):
@@ -151,19 +162,19 @@
                       x_mean_ranked[i],
                       fd_ranked[i]] for i in range(self.N)])
 
-  def hist(m,sd,hh,title,align):
-    sdd=[(i,m-(i*sd)) for i in range(-2,3)]
-    fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
-    sdax=hax.twiny()
-    hax.hist(hh,color='lightblue',align=align)
-    hax.set_title(title)
-    for s,v in sdd:
-         sdax.plot([v,v],[0,18],'b')
-    sdax.set_xlim(hax.get_xlim())
-    sdax.set_ylim(hax.get_ylim())
-    sdax.set_xticks([v for s,v in sdd])
-    sdax.set_xticklabels([str(s) for s,v in sdd])
-    plt.show()
+def hist(m,sd,hh,title,align):
+  sdd=[(i,m-(i*sd)) for i in range(-2,3)]
+  fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
+  sdax=hax.twiny()
+  hax.hist(hh,color='lightblue',align=align)
+  hax.set_title(title)
+  for s,v in sdd:
+       sdax.plot([v,v],[0,18],'b')
+  sdax.set_xlim(hax.get_xlim())
+  sdax.set_ylim(hax.get_ylim())
+  sdax.set_xticks([v for s,v in sdd])
+  sdax.set_xticklabels([str(s) for s,v in sdd])
+  plt.show()
 
 def first_diff(ranks):
   # first disagreement with baseline == {1,2,...}
@@ -182,6 +193,70 @@
   upper=math.tanh(math.atanh(rho)+delta)
   return (lower,upper)
 
+def explore_deltas(basis_ym, basis_names, target_ym, target_names, nn):
+  for n in nn:
+    for p in basis_names:
+      for q in target_names:
+        basis=SS[f'{basis_ym} {p}']
+        target=SS[f'{target_ym} {q}']
+        m = int((basis.N - n) / 2)
+        best = stats.describe(target.all[np.array(basis.iranks_by_all[:n])]).mean
+        middle = stats.describe(target.all[np.array(basis.iranks_by_all[m:m+n])]).mean
+        worst = stats.describe(target.all[np.array(basis.iranks_by_all[-n:])]).mean
+        delta1 = best-worst
+        stdev = math.sqrt(target.all_s.variance)
+        ds1 = delta1 / stdev
+        delta2 = abs(middle-target.all_m)
+        ds2 = delta2 / stdev
+        print(n,p,q,'%6.4g'%delta1,'%4.2g'%ds1,'%6.4g'%delta2,'%4.2g'%ds2)
+
+def predict(basis_ym, basis_name, target_ym, target_name, n):
+  basis = SS['%s %s'%(basis_ym, basis_name)]
+  target = SS['%s %s'%(target_ym, target_name)]
+  best = stats.describe(target.all[np.array(basis.iranks_by_all[:n])])
+  tbest = stats.describe(target.all[np.array(target.iranks_by_all[:n])])
+  m = int((basis.N - n) / 2)
+  middle = stats.describe(target.all[np.array(basis.iranks_by_all[m:m+n])])
+  tmiddle = stats.describe(target.all[np.array(target.iranks_by_all[m:m+n])])
+  randind = list(range(basis.N))
+  random.shuffle(randind)
+  rlist = np.array(randind[:n])
+  randist = stats.describe(target.all[np.array(basis.iranks_by_all)[rlist]])
+  trandist = stats.describe(target.all[rlist])
+  worst = stats.describe(target.all[np.array(basis.iranks_by_all[-n:])])
+  tworst = stats.describe(target.all[np.array(target.iranks_by_all[-n:])])
+  print(f"Predicting {target.id} from {basis.id}, window size {n}")
+  print("            Prediction        Actual")   
+  print("           mean   stdev   mean   stdev  value")
+  print("   Best %d %-6.4g %-6.3g  %-6.4g %-6.3g %-6.4g"%(n, best.mean,
+                                                     round(math.sqrt(best.variance),
+                                                           4),
+                                                     tbest.mean,
+                                                     round(math.sqrt(tbest.variance),
+                                                           4),
+                                                     target.all_s.minmax[1]))
+  print(" Middle %d %-6.4g %-6.3g  %-6.4g %-6.3g %-6.4g"%(n, middle.mean,
+                                                     round(math.sqrt(middle.variance),
+                                                           4),
+                                                     tmiddle.mean,
+                                                     round(math.sqrt(tmiddle.variance),
+                                                           4),
+                                                     target.all_m))
+  print("Randist %d %-6.4g %-6.3g  %-6.4g %-6.3g %-6.4g"%(n, randist.mean,
+                                                     round(math.sqrt(randist.variance),
+                                                           4),
+                                                     trandist.mean,
+                                                     round(math.sqrt(trandist.variance),
+                                                           4),
+                                                     target.all_m))
+  print("  Worst %d %-6.4g %-6.3g  %-6.4g %-6.3g %-6.4g"%(n, worst.mean,
+                                                     round(math.sqrt(worst.variance),
+                                                           4),
+                                                     tworst.mean,
+                                                     round(math.sqrt(tworst.variance),
+                                                           4),
+                                                     target.all_s.minmax[0]))
+
 if __name__ == '__main__':
   S = CCStats(*sys.argv[1:])
 
@@ -190,3 +265,5 @@
 ###   from the complete crawl and all the segments
 ### versus columns, e.g. self.counts[:,0] is an array of self.N decreasing self.counts
 ###   for all the labels in the complete crawl
+
+