Mercurial > hg > cc > cirrus_work

--- a/bin/spearman.py	Tue Dec 13 14:16:22 2022 +0000
+++ b/bin/spearman.py	Tue Dec 13 14:16:42 2022 +0000
@@ -12,7 +12,7 @@
      * 100 rows, one for each of the top 100 counts
      * 101 columns, 0 for all and 1--100 for segs 0--99

-   Usage: python3 -i spearman.py name
+   Usage: python3 -i spearman.py name id  (id: archive label shown in plot titles)
      where name.csv has the input
 '''

@@ -28,7 +28,7 @@
 def qqa():
   # q-q plot for the whole crawl
   sm.qqplot(all, line='s')
-  plt.gca().set_title('Rank correlation per segment wrt whole crawl (warc results only)')
+  plt.gca().set_title('Rank correlation per segment wrt whole archive %s'%id)
   plt.show()

 def qqs():
@@ -45,7 +45,7 @@
   plt.gca().set_title('Worst segment (most variance): %s'%xworst)
   plt.show()

-def plot_x(sort=False,block=True):
+def plot_x(sort=False,block=True,all_only=True,title=None):
   # Make these two subplots, w. and w/o sorting
   # See https://stackoverflow.com/questions/4700614/how-to-put-the-legend-outside-the-plot
   #  for legend hacking
@@ -56,28 +56,38 @@
   else:
     plot_all=all
     plot_x=[xd[i].mean for i in range(N)]
-  plt.plot(plot_all,'rx',label='Rank correlation of segment x whole crawl')
-  plt.plot([0,N-1],[all_m,all_m],'r',label='Mean of segment x whole crawl')
-  plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
-  plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
-  plt.axis([0,N-1,0.8,1.0])
-  plt.legend(loc='best',fontsize='small')
+  if title is None:
+    l1='Rank correlation of segment x whole crawl'
+    l2='Mean of segment x whole crawl'
+    plt.legend(loc='best',fontsize='small')
+  else:
+    l1=l2=None
+  plt.plot(plot_all,'rx',label=l1)
+  plt.plot([0,N-1],[all_m,all_m],'r',label=l2)
+  if not(all_only):
+    plt.plot(plot_x,'bx',label='Mean of rank correlation of each segment x all other segments')
+    plt.plot([0,N-1],[xm,xm],'b',label='Mean of segment x segment means')
+  plt.axis([0,N-1,0.85 if all_only else 0.8,1.0])
   plt.grid(True)
+  if title is not None:
+    plt.title(title)
   plt.show(block=block)

-def hist_x():
+def hist_x(align='mid'):
   hist(xm,xsd,[xd[i].mean for i in range(N)],
-       'Mean of rank correlation of each segment x all other segments')
+       'Mean of rank correlation of each segment x all other segments',
+       align)

-def hist_all():
+def hist_all(align='mid'):
   hist(all_m,np.sqrt(all_s.variance),all,
-       'Rank correlation of each segment x whole crawl')
+       'Rank correlation of each segment x whole crawl %s'%id,
+       align)

-def hist(m,sd,hh,title):
+def hist(m,sd,hh,title,align):
   sdd=[(i,m-(i*sd)) for i in range(-2,3)]
   fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497
   sdax=hax.twiny()
-  hax.hist(hh,color='lightblue')
+  hax.hist(hh,color='lightblue',align=align)
   hax.set_title(title)
   for s,v in sdd:
        sdax.plot([v,v],[0,18],'b')
@@ -111,6 +121,7 @@
    #for i in range(len(rhos)):
      #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1])
    plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o')
+   plt.title("Rank correlation of segments x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100))
    plt.show()

 def first_diff(ranks):
@@ -140,28 +151,35 @@
                     x_mean_ranked[i],
                     fd_ranked[i]] for i in range(N)])

-counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
-N=counts.shape[1]-1
-# "If axis=0 (default), then each column represents a variable, with
-#        observations in the rows"
-# So each column is a sequence of counts, for whole crawl in column 0
-#   and for segments 0--N-1 in columns 1--N
-corr=stats.spearmanr(counts,nan_policy='omit').correlation
+def main():
+  global counts, id, corr, all, all_s, all_m, x, xd, xs, xm, xsd, x_ranks, rr
+  global aa, aa_by_all, N
+  counts=loadtxt(sys.argv[1]+".csv",delimiter=',')
+  id=sys.argv[2]
+  N=counts.shape[1]-1
+  # "If axis=0 (default), then each column represents a variable, with
+  #        observations in the rows"
+  # So each column is a sequence of counts, for whole crawl in column 0
+  #   and for segments 0--N-1 in columns 1--N
+  corr=stats.spearmanr(counts,nan_policy='omit').correlation

-all=corr[0][1:]
-all_s=stats.describe(all)
-all_m=all_s.mean
+  all=corr[0][1:]
+  all_s=stats.describe(all)
+  all_m=all_s.mean

-x=np.array([np.concatenate((corr[i][1:i],
-                            corr[i][i+1:])) for i in range(1,N+1)])
-# The above, although transposed, works because the correlation matrix
-#  is symmetric
-xd=[stats.describe(x[i]) for i in range(N)]
-xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
-xm=xs.mean
-xsd=np.sqrt(xs.variance)
+  x=np.array([np.concatenate((corr[i][1:i],
+                              corr[i][i+1:])) for i in range(1,N+1)])
+  # The above, although transposed, works because the correlation matrix
+  #  is symmetric
+  xd=[stats.describe(x[i]) for i in range(N)]
+  xs=stats.describe(np.array([xd[i].mean for i in range(N)]))
+  xm=xs.mean
+  xsd=np.sqrt(xs.variance)

-x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
+  x_ranks=[stats.rankdata(-counts[:,i],method='average') for i in range(1,N+1)]
+
+  aa=ranks()
+  aa_by_all=aa[aa[:,1].argsort()]

 ### I need to review rows, e.g. counts[0] is an array of N+1 counts
 ###   for the most common label in the complete crawl,