# HG changeset patch # User Henry S. Thompson # Date 1701109539 0 # Node ID 0058e523f6497845396cf1f374982c4f49ae3532 # Parent a20cf98f3a7751e79d8a58b263d010f2c9168137 add explore_deltas and predict analysis fns diff -r a20cf98f3a77 -r 0058e523f649 lib/python/cc/ccstats.py --- a/lib/python/cc/ccstats.py Sun Nov 26 21:24:38 2023 +0000 +++ b/lib/python/cc/ccstats.py Mon Nov 27 18:25:39 2023 +0000 @@ -23,13 +23,19 @@ import matplotlib.pyplot as plt import pylab -import sys, math +import sys, math, os.path, random +SS = {} class CCStats: - def __init__(self, file, id): - self.file = file - self.id = id - self.counts=loadtxt(file+".csv",delimiter=',') + def __init__(self, year_month, name, file_base): + global SS + self.year_month = year_month + self.name = name + self.id = "%s %s"%(year_month,name) + SS[self.id] = self + self.file = os.path.expanduser("~/results/CC-MAIN-%s/%s.csv")%(year_month, + file_base) + self.counts=loadtxt(self.file,delimiter=',') self.N=self.counts.shape[1]-1 # "If axis=0 (default), then each column represents a variable, with # observations in the rows" @@ -40,6 +46,7 @@ self.all=self.corr[0][1:] self.all_s=stats.describe(self.all) self.all_m=self.all_s.mean + self.all_s.stdev=math.sqrt(self.all_s.variance) self.x=np.array([np.concatenate((self.corr[i][1:i], self.corr[i][i+1:])) for i in range(1,self.N+1)]) @@ -59,7 +66,7 @@ def qqa(self): # q-q plot for the whole crawl sm.qqplot(self.all, line='s') - plt.gca().set_title('Rank self.correlation per segment wrt whole archive %s'%id) + plt.gca().set_title('Rank self.correlation per segment wrt whole archive %s'%self.id) plt.show() def qqs(self): @@ -105,30 +112,34 @@ plt.show(block=block) def hist_x(self,align='mid'): - self.hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)], + hist(self.xm,self.xsd,[self.xd[i].mean for i in range(self.N)], 'Mean of rank self.correlation of each segment self.x self.all other segments', align) def hist_all(self,align='mid'): - self.hist(self.all_m,np.sqrt(self.all_s.variance),self.all, - 'Rank self.correlation of each segment self.x whole crawl %s'%id, + hist(self.all_m,np.sqrt(self.all_s.variance),self.all, + 'Rank self.correlation of each segment self.x whole crawl %s'%self.id, align) - def plot_ci(self,rhos,n,trim=None,conf=0.95): + def plot_ci(self,rhos=None,n=None,trim=None,conf=0.95): # rhos are (rank) correlation values + if rhos is None: + rhos = self.all + if n is None: + n = rhos.shape[0] rhos_s=rhos[(-rhos).argsort()] if trim is None: l=len(rhos) else: rhos_s=rhos_s[:trim] l=trim - cc=(np.array([self.ci(r,n,conf) for r in rhos_s])).T + cc=(np.array([ci(r,n,conf) for r in rhos_s])).T ue=cc[1]-rhos_s le=rhos_s-cc[0] #for i in range(len(rhos)): #print(cc[i][0],rhos_s[i]-cc[i][0],rhos_s[i],cc[i][1],-rhos_s[i]+cc[i][1]) plt.errorbar(np.arange(l),rhos_s,yerr=[le,ue],fmt='o') - plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(id,conf*100)) + plt.title("Rank self.correlation of segments self.x whole archive %s\nwith confidence bars at %d%%"%(self.id,conf*100)) plt.show() def ranks(self): @@ -151,19 +162,19 @@ x_mean_ranked[i], fd_ranked[i]] for i in range(self.N)]) - def hist(m,sd,hh,title,align): - sdd=[(i,m-(i*sd)) for i in range(-2,3)] - fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 - sdax=hax.twiny() - hax.hist(hh,color='lightblue',align=align) - hax.set_title(title) - for s,v in sdd: - sdax.plot([v,v],[0,18],'b') - sdax.set_xlim(hax.get_xlim()) - sdax.set_ylim(hax.get_ylim()) - sdax.set_xticks([v for s,v in sdd]) - sdax.set_xticklabels([str(s) for s,v in sdd]) - plt.show() +def hist(m,sd,hh,title,align): + sdd=[(i,m-(i*sd)) for i in range(-2,3)] + fig,hax=plt.subplots() # Thanks to https://stackoverflow.com/a/7769497 + sdax=hax.twiny() + hax.hist(hh,color='lightblue',align=align) + hax.set_title(title) + for s,v in sdd: + sdax.plot([v,v],[0,18],'b') + sdax.set_xlim(hax.get_xlim()) + sdax.set_ylim(hax.get_ylim()) + sdax.set_xticks([v for s,v in sdd]) + sdax.set_xticklabels([str(s) for s,v in sdd]) + plt.show() def first_diff(ranks): # first disagreement with baseline == {1,2,...} @@ -182,6 +193,70 @@ upper=math.tanh(math.atanh(rho)+delta) return (lower,upper) +def explore_deltas(basis_ym, basis_names, target_ym, target_names, nn): + for n in nn: + for p in basis_names: + for q in target_names: + basis=SS[f'{basis_ym} {p}'] + target=SS[f'{target_ym} {q}'] + m = int((basis.N - n) / 2) + best = stats.describe(target.all[np.array(basis.iranks_by_all[:n])]).mean + middle = stats.describe(target.all[np.array(basis.iranks_by_all[m:m+n])]).mean + worst = stats.describe(target.all[np.array(basis.iranks_by_all[-n:])]).mean + delta1 = best-worst + stdev = math.sqrt(target.all_s.variance) + ds1 = delta1 / stdev + delta2 = abs(middle-target.all_m) + ds2 = delta2 / stdev + print(n,p,q,'%6.4g'%delta1,'%4.2g'%ds1,'%6.4g'%delta2,'%4.2g'%ds2) + +def predict(basis_ym, basis_name, target_ym, target_name, n): + basis = SS['%s %s'%(basis_ym, basis_name)] + target = SS['%s %s'%(target_ym, target_name)] + best = stats.describe(target.all[np.array(basis.iranks_by_all[:n])]) + tbest = stats.describe(target.all[np.array(target.iranks_by_all[:n])]) + m = int((basis.N - n) / 2) + middle = stats.describe(target.all[np.array(basis.iranks_by_all[m:m+n])]) + tmiddle = stats.describe(target.all[np.array(target.iranks_by_all[m:m+n])]) + randind = list(range(100)) + random.shuffle(randind) + rlist = np.array(randind[:n]) + randist = stats.describe(target.all[np.array(basis.iranks_by_all)[rlist]]) + trandist = stats.describe(target.all[rlist]) + worst = stats.describe(target.all[np.array(basis.iranks_by_all[-n:])]) + tworst = stats.describe(target.all[np.array(target.iranks_by_all[-n:])]) + print(f"Predicting {basis.id} from {target.id}, window size {n}") + print(" Prediction Actual") + print(" mean stdev mean stdev value") + print(" Best %d %-6.4g %-6.3g %-6.4g %-6.3g %-6.4g"%(n, best.mean, + round(math.sqrt(best.variance), + 4), + tbest.mean, + round(math.sqrt(tbest.variance), + 4), + target.all_s.minmax[1])) + print(" Middle %d %-6.4g %-6.3g %-6.4g %-6.3g %-6.4g"%(n, middle.mean, + round(math.sqrt(middle.variance), + 4), + tmiddle.mean, + round(math.sqrt(tmiddle.variance), + 4), + target.all_m)) + print("Randist %d %-6.4g %-6.3g %-6.4g %-6.3g %-6.4g"%(n, randist.mean, + round(math.sqrt(randist.variance), + 4), + trandist.mean, + round(math.sqrt(trandist.variance), + 4), + target.all_m)) + print(" Worst %d %-6.4g %-6.3g %-6.4g %-6.3g %-6.4g"%(n, worst.mean, + round(math.sqrt(worst.variance), + 4), + tworst.mean, + round(math.sqrt(tworst.variance), + 4), + target.all_s.minmax[0])) + if __name__ == '__main__': S = CCStats(*sys.argv[1:]) @@ -190,3 +265,5 @@ ### from the complete crawl and all the segments ### versus columns, e.g. self.counts[:,0] is an array of self.N decreasing self.counts ### for all the labels in the complete crawl + +