annotate signif.py @ 62:c82a8743fd48

taking notes on how to merge
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 14 Dec 2023 00:13:19 +0000
parents 4d9778ade7b2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 from nltk import FreqDist
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 from random import randint
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 import pylab
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 from math import sqrt
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 def mean(self):
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
7 try:
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
8 return self.mean_value
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
9 except AttributeError:
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
10 # Assumes the keys of this distribution are numbers!
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
11 self.mean_value=float(sum(v*self[v] for v in self.keys()))/self.N()
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
12 return self.mean_value
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
13
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
14 def sd(self):
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
15 try:
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
16 return self.sd_value
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
17 except AttributeError:
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
18 ssd = 0
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
19 for v in self.keys():
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
20 d = v-self.mean()
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
21 ssd+=d*d*self[v]
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
22 self.sd_value=sqrt(ssd/float(self.N()))
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
23 return self.sd_value
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25 FreqDist.mean=mean
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
26 FreqDist.sd=sd
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
28 def bell(self,maxVal=None,bars=False,block=True,**kwargs):
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 # Assumes the keys of this distribution are numbers!
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 if maxVal is not None:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 sk = sorted([k for k in self.keys() if k<=maxVal]) # range(max(self.keys())+1)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 sk=sorted(self.keys())
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
34 print(len(sk))
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35 #sk.append(sk[-1]+1)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36 #sk[0:0]=[(sk[0]-1)]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37 mm=0 # sk[0]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
38 mean = self.mean()
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
39 sd = self.sd()
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
40 #print (mean,sd)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41 kv=[self[k] for k in sk]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
42 pylab.figure().subplots_adjust(bottom=0.15)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
43 pylab.plot(sk,kv,color='blue')
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
44 if 'xtra' in kwargs and kwargs['xtra']:
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
45 xtra=kwargs['xtra']
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
46 pylab.plot(sk,[xtra[k] for k in sk],color='red')
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47 if bars:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48 pylab.bar([s-mm for s in sk],kv,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
49 align='center',color='white',edgecolor='pink')
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
50 pylab.xticks(sk,rotation=90)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
51 mv=self[self.max()]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
52 bb=(-mv/10,mv+(mv/10))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53 pylab.plot((mean-mm,mean-mm),bb,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54 (mean-mm-sd,mean-mm-sd),bb,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55 (mean-mm-(2*sd),mean-mm-(2*sd)),bb,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
56 (mean-mm+sd,mean-mm+sd),bb,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
57 (mean-mm+(2*sd),mean-mm+(2*sd)),bb,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
58 color='green')
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
59 pylab.xlabel("N %s, max %s\nmean %5.2f, s.d. %5.2f"%(self.N(),mv,mean, sd))
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
60 pylab.show(block=block)
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
61
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
62 FreqDist.bell=bell
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
63
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
64 def ranks(l,**kvargs):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
65 # compute the rank of every element in a list
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
66 # uses sort, passing on all kv args
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
67 # uses key kv arg itself
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
68 # _Very_ inefficient, in several ways!
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
69 # Result is a pair:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
70 # list of ranks
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
71 # list of tie information, each elt the magnitude of a tie group
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
72 s=sorted(l,**kvargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
73 i=0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
74 res=[]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
75 td=[]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
76 if kvargs.has_key('key'):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
77 kf=kvargs['key']
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
78 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
79 kf=lambda x:x
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
80 while i<len(l):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
81 ties=[x for x in s if kf(s[i])==kf(x)]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
82 if len(ties)>1:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
83 td.append(len(ties))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
84 r=float(i+1+(i+len(ties)))/2.0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
85 for e in ties:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
86 res.append((r,e))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
87 i+=1
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
88 return (res,td)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
89
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
90 def mannWhitneyU(fd1,fd2,forceZ=False):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
91 # Compute Mann Whitney U test for two frequency distributions
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
92 # For n1 and n2 <= 20, see http://www.soc.univ.keiv.ua/LIB/PUB/T/textual.pdf
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
93 # to look up significance levels on the result: see Part 3 section 10,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
94 # actual page 150 (printed page 144)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
95 # Or use http://faculty.vassar.edu/lowry/utest.html to do it for you
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
96 # For n1 and n2 > 20, U itself is normally distributed, we
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
97 # return a tuple with a z-test value
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
98 # HST DOES NOT BELIEVE THIS IS CORRECT -- DOES NOT APPEAR TO GIVE CORRECT ANSWERS!!
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
99 r1=[(lambda x:x.append(1) or x)(list(x)) for x in fd1.items()]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
100 r2=[(lambda x:x.append(2) or x)(list(x)) for x in fd2.items()]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
101 n1=len(r1)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
102 n2=len(r2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
103 (ar,ties)=ranks(r1+r2,key=lambda e:e[1])
32
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
104 s1=sum(r[0] for r in ar if r[1][2] == 1)
4d9778ade7b2 python3, add sd and mean caching
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 0
diff changeset
105 s2=sum(r[0] for r in ar if r[1][2] == 2)
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
106 u1=float(n1*n2)+(float(n1*(n1+1))/2.0)-float(s1)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
107 u2=float(n1*n2)+(float(n2*(n2+1))/2.0)-float(s2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
108 u=min(u1,u2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
109 if forceZ or n1>20 or n2>20:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
110 # we can treat U as sample from a normal distribution, and compute
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
111 # a z-score
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
112 # See e.g. http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
113 mu=float(n1*n2)/2.0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
114 if len(ties)>0:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
115 n=float(n1+n2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
116 ts=sum((float((t*t*t)-t)/12.0) for t in ties)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
117 su=sqrt((float(n1*n2)/(n*n-1))*((float((n*n*n)-n)/12.0)-ts))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
118 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
119 su=sqrt(float(n1*n2*(n1+n2+1))/12.0)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
120 z=(u-mu)/su
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
121 return (n1,n2,u,z)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
122 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
123 return (n1,n2,u)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
124
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
125 # This started from http://dr-adorio-adventures.blogspot.com/2010/05/draft-untested.html
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
126 # but has a number of bug fixes
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
127 def Rank(l,**kvargs):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
128 # compute the rank of every element in a list
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
129 # uses sort, passing on all kv args
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
130 # uses key kv arg itself
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
131 # _Very_ inefficient, in several ways!
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
132 # Result is a list of pairs ( r, v) where r is a rank and v is an input value
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
133 s=sorted(l,**kvargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
134 i=0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
135 res=[]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
136 if kvargs.has_key('key'):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
137 kf=kvargs['key']
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
138 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
139 kf=lambda x:x
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
140 while i<len(l):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
141 ties=[x for x in s if kf(s[i])==kf(x)]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
142 r=float(i+1+(i+len(ties)))/2.0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
143 #print (i,r,ties)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
144 for e in ties:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
145 res.append((r,e))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
146 i+=1
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
147 return (res)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
148
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
149 def mannWhitney(S1, S2):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
150 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
151 Returns the Mann-Whitney U statistic of two samples S1 and S2.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
152 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
153 # Form a single array with a categorical variable indicate the sample
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
154 X = [(s, 0) for s in S1]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
155 X.extend([(s,1) for s in S2])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
156 R = Rank(X,key=lambda x:x[0])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
157
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
158 # Compute needed parameters.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
159 n1 = float(len(S1))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
160 n2 = float(len(S2))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
161
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
162 # Compute total ranks for sample 1.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
163 R1 = sum([i for i, (x,j) in R if j == 0])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
164 R2 = sum([i for i, (x,j) in R if j == 1])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
165 u1 = (n1*n2)+((n1*(n1+1))/2.0)-R1
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
166 u2 = n1 * n2 - u1
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
167 U = min(u1, u2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
168 #print u1,R1/n1,R2/n2
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
169
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
170 mU = n1 * n2 / 2.0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
171 sigmaU = sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
172 return u1, R1/n1,R2/n2, (U-mU)/sigmaU
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
173