Mercurial > hg > python
comparison signif.py @ 0:fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 09 Mar 2020 14:58:04 +0000 |
parents | |
children | 4d9778ade7b2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fee51ab07d09 |
---|---|
1 from nltk import FreqDist | |
2 from random import randint | |
3 import pylab | |
4 from math import sqrt | |
5 | |
6 def mean(self): | |
7 # Assumes the keys of this distribution are numbers! | |
8 return float(sum(v*self[v] for v in self.keys()))/self.N() | |
9 | |
10 FreqDist.mean=mean | |
11 | |
12 def bell(self,maxVal=None,bars=False,**kwargs): | |
13 # Assumes the keys of this distribution are numbers! | |
14 if maxVal is not None: | |
15 sk = sorted([k for k in self.keys() if k<=maxVal]) # range(max(self.keys())+1) | |
16 else: | |
17 sk=sorted(self.keys()) | |
18 print len(sk) | |
19 #sk.append(sk[-1]+1) | |
20 #sk[0:0]=[(sk[0]-1)] | |
21 mm=0 # sk[0] | |
22 mean = self.mean() | |
23 tot = 0 | |
24 ssd = 0 | |
25 for v in self.keys(): | |
26 d = v-mean | |
27 ssd+=d*d*self[v] | |
28 sd=sqrt(ssd/float(self.N())) | |
29 #print (mean,sd) | |
30 kv=[self[k] for k in sk] | |
31 pylab.figure().subplots_adjust(bottom=0.15) | |
32 pylab.plot(sk,kv,color='blue') | |
33 if kwargs['xtra']: | |
34 xtra=kwargs['xtra'] | |
35 pylab.plot(sk,[xtra[k] for k in sk],color='red') | |
36 if bars: | |
37 pylab.bar([s-mm for s in sk],kv, | |
38 align='center',color='white',edgecolor='pink') | |
39 pylab.xticks(sk,rotation=90) | |
40 mv=self[self.max()] | |
41 bb=(-mv/10,mv+(mv/10)) | |
42 pylab.plot((mean-mm,mean-mm),bb, | |
43 (mean-mm-sd,mean-mm-sd),bb, | |
44 (mean-mm-(2*sd),mean-mm-(2*sd)),bb, | |
45 (mean-mm+sd,mean-mm+sd),bb, | |
46 (mean-mm+(2*sd),mean-mm+(2*sd)),bb, | |
47 color='green') | |
48 pylab.xlabel("N %s, max %s\nmean %5.2f, s.d. %5.2f"%(self.N(),mv,mean, sd)) | |
49 pylab.show() | |
50 | |
51 FreqDist.bell=bell | |
52 | |
53 def ranks(l,**kvargs): | |
54 # compute the rank of every element in a list | |
55 # uses sort, passing on all kv args | |
56 # uses key kv arg itself | |
57 # _Very_ inefficient, in several ways! | |
58 # Result is a pair: | |
59 # list of ranks | |
60 # list of tie information, each elt the magnitude of a tie group | |
61 s=sorted(l,**kvargs) | |
62 i=0 | |
63 res=[] | |
64 td=[] | |
65 if kvargs.has_key('key'): | |
66 kf=kvargs['key'] | |
67 else: | |
68 kf=lambda x:x | |
69 while i<len(l): | |
70 ties=[x for x in s if kf(s[i])==kf(x)] | |
71 if len(ties)>1: | |
72 td.append(len(ties)) | |
73 r=float(i+1+(i+len(ties)))/2.0 | |
74 for e in ties: | |
75 res.append((r,e)) | |
76 i+=1 | |
77 return (res,td) | |
78 | |
79 def mannWhitneyU(fd1,fd2,forceZ=False): | |
80 # Compute Mann Whitney U test for two frequency distributions | |
81 # For n1 and n2 <= 20, see http://www.soc.univ.keiv.ua/LIB/PUB/T/textual.pdf | |
82 # to look up significance levels on the result: see Part 3 section 10, | |
83 # actual page 150 (printed page 144) | |
84 # Or use http://faculty.vassar.edu/lowry/utest.html to do it for you | |
85 # For n1 and n2 > 20, U itself is normally distributed, we | |
86 # return a tuple with a z-test value | |
87 # HST DOES NOT BELIEVE THIS IS CORRECT -- DOES NOT APPEAR TO GIVE CORRECT ANSWERS!! | |
88 r1=[(lambda x:x.append(1) or x)(list(x)) for x in fd1.items()] | |
89 r2=[(lambda x:x.append(2) or x)(list(x)) for x in fd2.items()] | |
90 n1=len(r1) | |
91 n2=len(r2) | |
92 (ar,ties)=ranks(r1+r2,key=lambda e:e[1]) | |
93 s1=sum(r[0] for r in ar if r[1][2] is 1) | |
94 s2=sum(r[0] for r in ar if r[1][2] is 2) | |
95 u1=float(n1*n2)+(float(n1*(n1+1))/2.0)-float(s1) | |
96 u2=float(n1*n2)+(float(n2*(n2+1))/2.0)-float(s2) | |
97 u=min(u1,u2) | |
98 if forceZ or n1>20 or n2>20: | |
99 # we can treat U as sample from a normal distribution, and compute | |
100 # a z-score | |
101 # See e.g. http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf | |
102 mu=float(n1*n2)/2.0 | |
103 if len(ties)>0: | |
104 n=float(n1+n2) | |
105 ts=sum((float((t*t*t)-t)/12.0) for t in ties) | |
106 su=sqrt((float(n1*n2)/(n*n-1))*((float((n*n*n)-n)/12.0)-ts)) | |
107 else: | |
108 su=sqrt(float(n1*n2*(n1+n2+1))/12.0) | |
109 z=(u-mu)/su | |
110 return (n1,n2,u,z) | |
111 else: | |
112 return (n1,n2,u) | |
113 | |
114 # This started from http://dr-adorio-adventures.blogspot.com/2010/05/draft-untested.html | |
115 # but has a number of bug fixes | |
116 def Rank(l,**kvargs): | |
117 # compute the rank of every element in a list | |
118 # uses sort, passing on all kv args | |
119 # uses key kv arg itself | |
120 # _Very_ inefficient, in several ways! | |
121 # Result is a list of pairs ( r, v) where r is a rank and v is an input value | |
122 s=sorted(l,**kvargs) | |
123 i=0 | |
124 res=[] | |
125 if kvargs.has_key('key'): | |
126 kf=kvargs['key'] | |
127 else: | |
128 kf=lambda x:x | |
129 while i<len(l): | |
130 ties=[x for x in s if kf(s[i])==kf(x)] | |
131 r=float(i+1+(i+len(ties)))/2.0 | |
132 #print (i,r,ties) | |
133 for e in ties: | |
134 res.append((r,e)) | |
135 i+=1 | |
136 return (res) | |
137 | |
138 def mannWhitney(S1, S2): | |
139 """ | |
140 Returns the Mann-Whitney U statistic of two samples S1 and S2. | |
141 """ | |
142 # Form a single array with a categorical variable indicate the sample | |
143 X = [(s, 0) for s in S1] | |
144 X.extend([(s,1) for s in S2]) | |
145 R = Rank(X,key=lambda x:x[0]) | |
146 | |
147 # Compute needed parameters. | |
148 n1 = float(len(S1)) | |
149 n2 = float(len(S2)) | |
150 | |
151 # Compute total ranks for sample 1. | |
152 R1 = sum([i for i, (x,j) in R if j == 0]) | |
153 R2 = sum([i for i, (x,j) in R if j == 1]) | |
154 u1 = (n1*n2)+((n1*(n1+1))/2.0)-R1 | |
155 u2 = n1 * n2 - u1 | |
156 U = min(u1, u2) | |
157 #print u1,R1/n1,R2/n2 | |
158 | |
159 mU = n1 * n2 / 2.0 | |
160 sigmaU = sqrt((n1 * n2 * (n1 + n2 + 1))/12.0) | |
161 return u1, R1/n1,R2/n2, (U-mU)/sigmaU | |
162 |