diff signif.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
parents
children 4d9778ade7b2
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/signif.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,162 @@
+from nltk import FreqDist
+from random import randint
+import pylab
+from math import sqrt
+
+def mean(self):
+  # Assumes the keys of this distribution are numbers!
+  return float(sum(v*self[v] for v in self.keys()))/self.N()
+
+FreqDist.mean=mean
+
+def bell(self,maxVal=None,bars=False,**kwargs):
+  # Assumes the keys of this distribution are numbers!
+  if maxVal is not None:
+    sk = sorted([k for k in self.keys() if k<=maxVal]) # range(max(self.keys())+1)
+  else:
+    sk=sorted(self.keys())
+  print len(sk)
+  #sk.append(sk[-1]+1)
+  #sk[0:0]=[(sk[0]-1)]
+  mm=0 # sk[0]
+  mean = self.mean()
+  tot = 0
+  ssd = 0
+  for v in self.keys():
+    d = v-mean
+    ssd+=d*d*self[v]
+  sd=sqrt(ssd/float(self.N()))
+  #print (mean,sd)
+  kv=[self[k] for k in sk]
+  pylab.figure().subplots_adjust(bottom=0.15)
+  pylab.plot(sk,kv,color='blue')
+  if kwargs['xtra']:
+    xtra=kwargs['xtra']
+    pylab.plot(sk,[xtra[k] for k in sk],color='red')
+  if bars:
+    pylab.bar([s-mm for s in sk],kv,
+              align='center',color='white',edgecolor='pink')
+  pylab.xticks(sk,rotation=90)
+  mv=self[self.max()]
+  bb=(-mv/10,mv+(mv/10))
+  pylab.plot((mean-mm,mean-mm),bb,
+             (mean-mm-sd,mean-mm-sd),bb,
+             (mean-mm-(2*sd),mean-mm-(2*sd)),bb,
+             (mean-mm+sd,mean-mm+sd),bb,
+             (mean-mm+(2*sd),mean-mm+(2*sd)),bb,
+             color='green')
+  pylab.xlabel("N %s, max %s\nmean %5.2f, s.d. %5.2f"%(self.N(),mv,mean, sd))
+  pylab.show()
+
+FreqDist.bell=bell
+
+def ranks(l,**kvargs):
+  # compute the rank of every element in a list
+  # uses sort, passing on all kv args
+  # uses key kv arg itself
+  # _Very_ inefficient, in several ways!
+  # Result is a pair:
+  #  list of ranks
+  #  list of tie information, each elt the magnitude of a tie group
+  s=sorted(l,**kvargs)
+  i=0
+  res=[]
+  td=[]
+  if kvargs.has_key('key'):
+    kf=kvargs['key']
+  else:
+    kf=lambda x:x
+  while i<len(l):
+    ties=[x for x in s if kf(s[i])==kf(x)]
+    if len(ties)>1:
+      td.append(len(ties))
+    r=float(i+1+(i+len(ties)))/2.0
+    for e in ties:
+      res.append((r,e))
+      i+=1
+  return (res,td)
+
+def mannWhitneyU(fd1,fd2,forceZ=False):
+  # Compute Mann Whitney U test for two frequency distributions
+  # For n1 and n2 <= 20, see http://www.soc.univ.keiv.ua/LIB/PUB/T/textual.pdf
+  #  to look up significance levels on the result: see Part 3 section 10,
+  #  actual page 150 (printed page 144)
+  # Or use http://faculty.vassar.edu/lowry/utest.html to do it for you
+  # For n1 and n2 > 20, U itself is normally distributed, we
+  #  return a tuple with a z-test value
+  # HST DOES NOT BELIEVE THIS IS CORRECT -- DOES NOT APPEAR TO GIVE CORRECT ANSWERS!!
+  r1=[(lambda x:x.append(1) or x)(list(x)) for x in fd1.items()]
+  r2=[(lambda x:x.append(2) or x)(list(x)) for x in fd2.items()]
+  n1=len(r1)
+  n2=len(r2)
+  (ar,ties)=ranks(r1+r2,key=lambda e:e[1])
+  s1=sum(r[0] for r in ar if r[1][2] is 1)
+  s2=sum(r[0] for r in ar if r[1][2] is 2)
+  u1=float(n1*n2)+(float(n1*(n1+1))/2.0)-float(s1)
+  u2=float(n1*n2)+(float(n2*(n2+1))/2.0)-float(s2)
+  u=min(u1,u2)
+  if forceZ or n1>20 or n2>20:
+    # we can treat U as sample from a normal distribution, and compute
+    # a z-score
+    # See e.g. http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf
+    mu=float(n1*n2)/2.0
+    if len(ties)>0:
+      n=float(n1+n2)
+      ts=sum((float((t*t*t)-t)/12.0) for t in ties)
+      su=sqrt((float(n1*n2)/(n*n-1))*((float((n*n*n)-n)/12.0)-ts))
+    else:
+      su=sqrt(float(n1*n2*(n1+n2+1))/12.0)
+    z=(u-mu)/su
+    return (n1,n2,u,z)
+  else:
+    return (n1,n2,u)
+
+# This started from http://dr-adorio-adventures.blogspot.com/2010/05/draft-untested.html
+#  but has a number of bug fixes
+def Rank(l,**kvargs):
+  # compute the rank of every element in a list
+  # uses sort, passing on all kv args
+  # uses key kv arg itself
+  # _Very_ inefficient, in several ways!
+  # Result is a list of pairs ( r, v) where r is a rank and v is an input value
+  s=sorted(l,**kvargs)
+  i=0
+  res=[]
+  if kvargs.has_key('key'):
+    kf=kvargs['key']
+  else:
+    kf=lambda x:x
+  while i<len(l):
+    ties=[x for x in s if kf(s[i])==kf(x)]
+    r=float(i+1+(i+len(ties)))/2.0
+    #print (i,r,ties)
+    for e in ties:
+      res.append((r,e))
+      i+=1
+  return (res)
+
+def mannWhitney(S1, S2):
+    """
+    Returns the Mann-Whitney U statistic of two samples S1 and S2.
+    """
+    # Form a single array with a categorical variable indicate the sample
+    X = [(s, 0) for s in S1]
+    X.extend([(s,1) for s in S2])
+    R = Rank(X,key=lambda x:x[0])
+ 
+    # Compute needed parameters.
+    n1 = float(len(S1))
+    n2 = float(len(S2))
+ 
+    # Compute total ranks for sample 1.          
+    R1 = sum([i for i, (x,j) in R if j == 0])
+    R2 = sum([i for i, (x,j) in R if j == 1])
+    u1 = (n1*n2)+((n1*(n1+1))/2.0)-R1
+    u2 = n1 * n2 - u1
+    U = min(u1, u2)
+    #print u1,R1/n1,R2/n2
+ 
+    mU     = n1 * n2 / 2.0
+    sigmaU = sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
+    return u1, R1/n1,R2/n2, (U-mU)/sigmaU
+