comparison signif.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
parents
children 4d9778ade7b2
comparison
equal deleted inserted replaced
-1:000000000000 0:fee51ab07d09
1 from nltk import FreqDist
2 from random import randint
3 import pylab
4 from math import sqrt
5
6 def mean(self):
7 # Assumes the keys of this distribution are numbers!
8 return float(sum(v*self[v] for v in self.keys()))/self.N()
9
10 FreqDist.mean=mean
11
12 def bell(self,maxVal=None,bars=False,**kwargs):
13 # Assumes the keys of this distribution are numbers!
14 if maxVal is not None:
15 sk = sorted([k for k in self.keys() if k<=maxVal]) # range(max(self.keys())+1)
16 else:
17 sk=sorted(self.keys())
18 print len(sk)
19 #sk.append(sk[-1]+1)
20 #sk[0:0]=[(sk[0]-1)]
21 mm=0 # sk[0]
22 mean = self.mean()
23 tot = 0
24 ssd = 0
25 for v in self.keys():
26 d = v-mean
27 ssd+=d*d*self[v]
28 sd=sqrt(ssd/float(self.N()))
29 #print (mean,sd)
30 kv=[self[k] for k in sk]
31 pylab.figure().subplots_adjust(bottom=0.15)
32 pylab.plot(sk,kv,color='blue')
33 if kwargs['xtra']:
34 xtra=kwargs['xtra']
35 pylab.plot(sk,[xtra[k] for k in sk],color='red')
36 if bars:
37 pylab.bar([s-mm for s in sk],kv,
38 align='center',color='white',edgecolor='pink')
39 pylab.xticks(sk,rotation=90)
40 mv=self[self.max()]
41 bb=(-mv/10,mv+(mv/10))
42 pylab.plot((mean-mm,mean-mm),bb,
43 (mean-mm-sd,mean-mm-sd),bb,
44 (mean-mm-(2*sd),mean-mm-(2*sd)),bb,
45 (mean-mm+sd,mean-mm+sd),bb,
46 (mean-mm+(2*sd),mean-mm+(2*sd)),bb,
47 color='green')
48 pylab.xlabel("N %s, max %s\nmean %5.2f, s.d. %5.2f"%(self.N(),mv,mean, sd))
49 pylab.show()
50
51 FreqDist.bell=bell
52
53 def ranks(l,**kvargs):
54 # compute the rank of every element in a list
55 # uses sort, passing on all kv args
56 # uses key kv arg itself
57 # _Very_ inefficient, in several ways!
58 # Result is a pair:
59 # list of ranks
60 # list of tie information, each elt the magnitude of a tie group
61 s=sorted(l,**kvargs)
62 i=0
63 res=[]
64 td=[]
65 if kvargs.has_key('key'):
66 kf=kvargs['key']
67 else:
68 kf=lambda x:x
69 while i<len(l):
70 ties=[x for x in s if kf(s[i])==kf(x)]
71 if len(ties)>1:
72 td.append(len(ties))
73 r=float(i+1+(i+len(ties)))/2.0
74 for e in ties:
75 res.append((r,e))
76 i+=1
77 return (res,td)
78
79 def mannWhitneyU(fd1,fd2,forceZ=False):
80 # Compute Mann Whitney U test for two frequency distributions
81 # For n1 and n2 <= 20, see http://www.soc.univ.keiv.ua/LIB/PUB/T/textual.pdf
82 # to look up significance levels on the result: see Part 3 section 10,
83 # actual page 150 (printed page 144)
84 # Or use http://faculty.vassar.edu/lowry/utest.html to do it for you
85 # For n1 and n2 > 20, U itself is normally distributed, we
86 # return a tuple with a z-test value
87 # HST DOES NOT BELIEVE THIS IS CORRECT -- DOES NOT APPEAR TO GIVE CORRECT ANSWERS!!
88 r1=[(lambda x:x.append(1) or x)(list(x)) for x in fd1.items()]
89 r2=[(lambda x:x.append(2) or x)(list(x)) for x in fd2.items()]
90 n1=len(r1)
91 n2=len(r2)
92 (ar,ties)=ranks(r1+r2,key=lambda e:e[1])
93 s1=sum(r[0] for r in ar if r[1][2] is 1)
94 s2=sum(r[0] for r in ar if r[1][2] is 2)
95 u1=float(n1*n2)+(float(n1*(n1+1))/2.0)-float(s1)
96 u2=float(n1*n2)+(float(n2*(n2+1))/2.0)-float(s2)
97 u=min(u1,u2)
98 if forceZ or n1>20 or n2>20:
99 # we can treat U as sample from a normal distribution, and compute
100 # a z-score
101 # See e.g. http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf
102 mu=float(n1*n2)/2.0
103 if len(ties)>0:
104 n=float(n1+n2)
105 ts=sum((float((t*t*t)-t)/12.0) for t in ties)
106 su=sqrt((float(n1*n2)/(n*n-1))*((float((n*n*n)-n)/12.0)-ts))
107 else:
108 su=sqrt(float(n1*n2*(n1+n2+1))/12.0)
109 z=(u-mu)/su
110 return (n1,n2,u,z)
111 else:
112 return (n1,n2,u)
113
114 # This started from http://dr-adorio-adventures.blogspot.com/2010/05/draft-untested.html
115 # but has a number of bug fixes
116 def Rank(l,**kvargs):
117 # compute the rank of every element in a list
118 # uses sort, passing on all kv args
119 # uses key kv arg itself
120 # _Very_ inefficient, in several ways!
121 # Result is a list of pairs ( r, v) where r is a rank and v is an input value
122 s=sorted(l,**kvargs)
123 i=0
124 res=[]
125 if kvargs.has_key('key'):
126 kf=kvargs['key']
127 else:
128 kf=lambda x:x
129 while i<len(l):
130 ties=[x for x in s if kf(s[i])==kf(x)]
131 r=float(i+1+(i+len(ties)))/2.0
132 #print (i,r,ties)
133 for e in ties:
134 res.append((r,e))
135 i+=1
136 return (res)
137
138 def mannWhitney(S1, S2):
139 """
140 Returns the Mann-Whitney U statistic of two samples S1 and S2.
141 """
142 # Form a single array with a categorical variable indicate the sample
143 X = [(s, 0) for s in S1]
144 X.extend([(s,1) for s in S2])
145 R = Rank(X,key=lambda x:x[0])
146
147 # Compute needed parameters.
148 n1 = float(len(S1))
149 n2 = float(len(S2))
150
151 # Compute total ranks for sample 1.
152 R1 = sum([i for i, (x,j) in R if j == 0])
153 R2 = sum([i for i, (x,j) in R if j == 1])
154 u1 = (n1*n2)+((n1*(n1+1))/2.0)-R1
155 u2 = n1 * n2 - u1
156 U = min(u1, u2)
157 #print u1,R1/n1,R2/n2
158
159 mU = n1 * n2 / 2.0
160 sigmaU = sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
161 return u1, R1/n1,R2/n2, (U-mU)/sigmaU
162