ngram.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import random, types
from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              MLEProbDist, FreqDist)
try:
    from nltk.util import ingrams
except ImportError:
    from nltkx.util import ingrams

from api import *
class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
            returns a C{ConditionalProbDist}
        @param pad_left: whether to pad the left of each sentence with an
            (n-1)-gram of C{'<s>'} markers
        @type pad_left: C{bool}
        @param pad_right: whether to pad the right of each sentence with
            C{'</s>'} markers
        @type pad_right: C{bool}
        @param estimator_args: Extra arguments for the estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified,
            the number of bins in the underlying ConditionalFreqDist is passed
            to the estimator as an argument.
        @type estimator_args: (any)
        @param estimator_kwargs: Extra keyword arguments for the estimator
        @type estimator_kwargs: (any)
        """
        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        self._n = n
        self._W = len(train)
        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
        # Need _rpad even for unigrams or padded entropy will give
        # wrong answer because '' will be treated as unseen...
        self._rpad = ('</s>',) * (max(1, (n - 1))) if pad_right else ()
        self._padLen = len(self._lpad) + len(self._rpad)

        self._N = 0
        delta = 1 + self._padLen - n    # len(sent)+delta == ngrams in sent

        if estimator is None:
            assert not estimator_args and not estimator_kwargs, \
                   "estimator_args or _kwargs supplied, but no estimator"
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # Given backoff, a generator isn't acceptable
        if isinstance(train, types.GeneratorType):
            train = list(train)

        if n == 1:
            if pad_right:
                sents = (chain(s, self._rpad) for s in train)
            else:
                sents = train
            fd = FreqDist()
            for s in sents:
                fd.update(s)
            if not estimator_args and not estimator_kwargs:
                self._model = estimator(fd, fd.B())
            else:
                self._model = estimator(fd, fd.B(),
                                        *estimator_args, **estimator_kwargs)
            self._N = fd.N()
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()

            for sent in train:
                self._N += len(sent) + delta
                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context][token] += 1
            if not estimator_args and not estimator_kwargs:
                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
            else:
                self._model = ConditionalProbDist(cfd, estimator,
                                                  *estimator_args,
                                                  **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                                       estimator, *estimator_args,
                                       **estimator_kwargs)

            # Code below here in this method, and the _words_following and
            # _alpha methods, are from
            # http://www.nltk.org/_modules/nltk/model/ngram.html
            # "Last updated on Feb 26, 2015"
            self._backoff_alphas = dict()
            # For each condition (or context)
            #print cfd,cfd.conditions()
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context, i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                assert (0 <= total_observed_pr <= 1), \
                       "sum of probs for %s out of bounds: %s" % (ctxt, total_observed_pr)
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # if backoff total is 1, that should mean that all samples
                # occur in this context, so we will never back off.
                # Greater than 1 is an error.
                assert (0 <= backoff_total_pr < 1), \
                       "sum of backoff probs for %s out of bounds: %s" % (ctxt, backoff_total_pr)
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt

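    # In outline, the loop above reserves, for each seen context c,
    #
    #     beta(c) = 1 - sum over observed w of P(w | c)
    #
    # of the probability mass for unseen words, and computes the factor needed
    # to rescale the (n-1)-gram model to fill exactly that gap:
    #
    #     alpha(c) = beta(c) / (1 - sum over observed w of P_backoff(w | c[1:]))
    #
    # For example, if the words observed after c carry 0.7 of the mass under
    # this model and 0.6 under the backoff model, alpha(c) = 0.3 / 0.4 = 0.75.
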
    def _words_following(self, context, cond_freq_dist):
        return cond_freq_dist[context].iterkeys()
        # below from http://www.nltk.org/_modules/nltk/model/ngram.html,
        # depends on new CFD???
        #for ctxt, word in cond_freq_dist.iterkeys():
        #    if ctxt == context:
        #        yield word

    def prob(self, word, context, verbose=False):
        """
        Evaluate the probability of this word in this context
        using Katz Backoff.
        """
        assert(isinstance(word, types.StringTypes))
        context = tuple(context)
        if self._n == 1:
            if not(self._model.SUM_TO_ONE):
                # Smoothing models should do the right thing for unigrams
                # even if they're 'absent'
                return self._model.prob(word)
            else:
                try:
                    return self._model.prob(word)
                except:
                    raise RuntimeError("No probability mass assigned "
                                       "to unigram %s" % (word,))
        if context + (word,) in self._ngrams:
            return self[context].prob(word)
        else:
            alpha = self._alpha(context)
            if alpha > 0:
                if verbose:
                    print "backing off for %s" % (context + (word,),)
                return alpha * self._backoff.prob(word, context[1:], verbose)
            else:
                if verbose:
                    print "no backoff for %s as model doesn't do any smoothing" % word
                return alpha

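    # In outline, prob() implements the Katz backoff recursion
    #
    #     P_katz(w | c) = P(w | c)                     if (c, w) was seen
    #                   = alpha(c) * P_katz(w | c[1:]) otherwise
    #
    # bottoming out in the unigram model, where no further backoff is possible;
    # alpha(c) is the weight precomputed in __init__ and looked up by _alpha().
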
    def _alpha(self, context, verbose=False):
        """Get the backoff alpha value for the given context
        """
        error_message = "Alphas and backoff are not defined for unigram models"
        assert (not self._n == 1), error_message

        if context in self._backoff_alphas:
            res = self._backoff_alphas[context]
        else:
            res = 1
        if verbose:
            print " alpha: %s = %s" % (context, res)
        return res


    def logprob(self, word, context, verbose=False):
        """
        Evaluate the (negative) log probability of this word in this context.
        """

        return -log(self.prob(word, context, verbose), 2)

    # NB, this will always start with the same word, since the model
    # is trained on a single text
    def generate(self, num_words, context=()):
        '''Generate random text based on the language model.'''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        context = (self._lpad + tuple(context))[-self._n + 1:]
        # print "Context (%d): <%s>" % (self._n, ','.join(context))
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text, pad_left=False, pad_right=False,
                verbose=False, perItem=False):
        """
        Evaluate the total entropy of a text with respect to the model.
        This is the sum of the negative log2 probability (cost) of each
        word in the message.
        """
        # This version takes account of padding for greater accuracy
        e = 0.0
        for ngram in ingrams(chain(self._lpad, text, self._rpad), self._n):
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cost = self.logprob(token, context, verbose)  # _negative_ log2 prob == cost!
            if verbose:
                print "p(%s|%s) = [%s-gram] %7f" % (token, context, self._n, 2**-cost)
            e += cost
        if perItem:
            return e / ((len(text) + self._padLen) - (self._n - 1))
        else:
            return e

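    # With perItem=True the return value is the per-token cost in bits, so a
    # conventional perplexity figure can be obtained as, for example,
    #
    #     perplexity = 2 ** lm.entropy(sent, perItem=True)
    #
    # where 'lm' and 'sent' stand for any trained NgramModel and a tokenised
    # sentence (illustrative names only).
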
    def dump(self, file, logBase=None, precision=7):
        """Dump this model in SRILM/ARPA/Doug Paul format

        Use logBase=10 and the default precision to get something comparable
        to SRILM ngram-model -lm output
        @param file: file to dump to
        @type file: file
        @param logBase: If not None, output log probabilities to the specified base
        @type logBase: C{int}|None"""
        file.write('\n\\data\\\n')
        self._writeLens(file)
        self._writeModels(file, logBase, precision, None)
        file.write('\\end\\\n')

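    # The output of dump() follows the usual ARPA n-gram layout, roughly
    # (an illustrative sketch only; the counts and probabilities are invented):
    #
    #     \data\
    #     ngram 1=120
    #     ngram 2=3421
    #
    #     \1-grams:
    #     0.00123<TAB>the<TAB>0.75
    #     ...
    #     \2-grams:
    #     0.04219<TAB>of the
    #     ...
    #     \end\
    #
    # i.e. one "prob<TAB>n-gram[<TAB>backoff weight]" line per n-gram, with
    # probabilities written as logs to logBase when it is given.
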
    def _writeLens(self, file):
        if self._n > 1:
            self._backoff._writeLens(file)
            file.write('ngram %s=%s\n' % (self._n,
                                          sum(len(self._model[c].samples())
                                              for c in self._model.keys())))
        else:
            file.write('ngram 1=%s\n' % len(self._model.samples()))


    def _writeModels(self, file, logBase, precision, alphas):
        if self._n > 1:
            self._backoff._writeModels(file, logBase, precision,
                                       self._backoff_alphas)
        file.write('\n\\%s-grams:\n' % self._n)
        if self._n == 1:
            self._writeProbs(self._model, file, logBase, precision, (), alphas)
        else:
            for c in sorted(self._model.conditions()):
                self._writeProbs(self._model[c], file, logBase, precision,
                                 c, alphas)

    def _writeProbs(self, pd, file, logBase, precision, ctxt, alphas):
        if self._n == 1:
            for k in sorted(pd.samples() + ['<unk>', '<s>']):
                if k == '<s>':
                    file.write('-99')
                elif k == '<unk>':
                    _writeProb(file, logBase, precision, 1 - pd.discount())
                else:
                    _writeProb(file, logBase, precision, pd.prob(k))
                file.write('\t%s' % k)
                if k not in ('</s>', '<unk>'):
                    file.write('\t')
                    _writeProb(file, logBase, precision, alphas[ctxt + (k,)])
                file.write('\n')
        else:
            ctxtString = ' '.join(ctxt)
            for k in sorted(pd.samples()):
                _writeProb(file, logBase, precision, pd.prob(k))
                file.write('\t%s %s' % (ctxtString, k))
                if alphas is not None:
                    file.write('\t')
                    _writeProb(file, logBase, precision, alphas[ctxt + (k,)])
                file.write('\n')

    def __contains__(self, item):
        try:
            return item in self._model
        except:
            try:
                # hack if model is an MLEProbDist, more efficient
                return item in self._model._freqdist
            except:
                return item in self._model.samples()

    def __getitem__(self, item):
        return self._model[item]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (self._N, self._n)

def _writeProb(file, logBase, precision, p):
    file.write('%.*g' % (precision,
                         p if logBase is None else log(p, logBase)))

def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.sents(categories='news'), estimator=estimator)
    print lm
    # print lm.entropy(sent)
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))

if __name__ == '__main__':
    demo()
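
# Usage sketch (illustrative only: 'train_sents' and 'test_sent' stand for a
# list of token lists and a single token list respectively, and the smoothing
# choice is just an example):
#
#     from nltk.probability import LidstoneProbDist
#     est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2, bins)
#     lm = NgramModel(3, train_sents, pad_left=True, pad_right=True,
#                     estimator=est)
#     cost = lm.entropy(test_sent, perItem=True)      # bits per token
#     print 'perplexity %.1f' % 2 ** cost
#     lm.dump(open('mymodel.arpa', 'w'), logBase=10)  # SRILM-style dump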