ngram.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain

author    Henry S. Thompson <ht@inf.ed.ac.uk>
date      Mon, 09 Mar 2020 14:58:04 +0000
parents   (none)
children  (none)
comparison  -1:000000000000 → 0:fee51ab07d09
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import random, types
from itertools import chain
from math import log

from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
                              MLEProbDist, FreqDist)
try:
    from nltk.util import ingrams
except ImportError:
    from nltkx.util import ingrams

from api import *

class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
            returns a C{ConditionalProbDist}
        @param pad_left: whether to pad the left of each sentence with an
            (n-1)-gram of '<s>' markers
        @type pad_left: bool
        @param pad_right: whether to pad the right of each sentence with
            '</s>' markers (an (n-1)-gram, or a single marker for unigram models)
        @type pad_right: bool
        @param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist is passed to
            the estimator as an argument.
        @type estimator_args: (any)
        @param estimator_kwargs: Extra keyword arguments for the estimator
        @type estimator_kwargs: (any)
        """
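        # Illustrative use of this constructor (a sketch, not part of the
        # original file): `sents` below stands for any tokenised training
        # text, i.e. a list of lists of strings as described above.
        #
        #   from nltk.probability import LidstoneProbDist
        #   est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2, bins)
        #   lm = NgramModel(3, sents, pad_left=True, pad_right=True,
        #                   estimator=est)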
        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # Given backoff, a generator isn't acceptable as training input,
        # and len(train) below needs a sequence, so realise it up front
        if isinstance(train, types.GeneratorType):
            train = list(train)

        self._n = n
        self._W = len(train)
        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
        # Need _rpad even for unigrams or padded entropy will give
        # wrong answer because '' will be treated as unseen...
        self._rpad = ('</s>',) * (max(1, n - 1)) if pad_right else ()
        self._padLen = len(self._lpad) + len(self._rpad)

        self._N = 0
        delta = 1 + self._padLen - n    # len(sent)+delta == ngrams in sent

        if estimator is None:
            # estimator_args is always a tuple and estimator_kwargs a dict,
            # so test for emptiness rather than identity with None
            assert not estimator_args and not estimator_kwargs, \
                "estimator_args or _kwargs supplied, but no estimator"
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        if n == 1:
            if pad_right:
                sents = (chain(s, self._rpad) for s in train)
            else:
                sents = train
            fd = FreqDist()
            for s in sents:
                fd.update(s)
            if not estimator_args and not estimator_kwargs:
                self._model = estimator(fd, fd.B())
            else:
                self._model = estimator(fd, fd.B(),
                                        *estimator_args, **estimator_kwargs)
            self._N = fd.N()
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()

            for sent in train:
                self._N += len(sent) + delta
                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context][token] += 1
            if not estimator_args and not estimator_kwargs:
                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
            else:
                self._model = ConditionalProbDist(cfd, estimator,
                                                  *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                       estimator, *estimator_args, **estimator_kwargs)
            # Code below here in this method, and the _words_following and
            # _alpha methods, are from
            # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context, i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                assert (0 <= total_observed_pr <= 1), \
                    "sum of probs for %s out of bounds: %s" % (ctxt, total_observed_pr)
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be >= 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # if backoff total were 1, that would mean that all samples
                # occur in this context, so we would never back off;
                # greater than or equal to 1 fails the sanity check below.
                assert (0 <= backoff_total_pr < 1), \
                    "sum of backoff probs for %s out of bounds: %s" % (ctxt, backoff_total_pr)
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt

    def _words_following(self, context, cond_freq_dist):
        return cond_freq_dist[context].iterkeys()
        # below from http://www.nltk.org/_modules/nltk/model/ngram.html,
        # depends on new CFD???
        #for ctxt, word in cond_freq_dist.iterkeys():
        #    if ctxt == context:
        #        yield word

    def prob(self, word, context, verbose=False):
        """
        Evaluate the probability of this word in this context
        using Katz Backoff.
        """
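        # In symbols, the scheme implemented below and in _alpha is
        # (a restatement of the code, not from the original file):
        #
        #   P_katz(w | ctxt) = P(w | ctxt)                        if the full ngram was observed
        #                    = alpha(ctxt) * P_katz(w | ctxt[1:]) otherwise
        #
        # where alpha(ctxt), precomputed in __init__ (and defaulting to 1
        # for contexts unseen in training), is
        #   (1 - sum of P(w' | ctxt) over observed w') /
        #   (1 - sum of backoff P(w' | ctxt[1:]) over the same w')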
        assert(isinstance(word, types.StringTypes))
        context = tuple(context)
        if self._n == 1:
            if not self._model.SUM_TO_ONE:
                # Smoothing models should do the right thing for unigrams
                # even if they're 'absent'
                return self._model.prob(word)
            else:
                try:
                    return self._model.prob(word)
                except:
                    raise RuntimeError("No probability mass assigned "
                                       "to unigram %s" % (word,))
        if context + (word,) in self._ngrams:
            return self[context].prob(word)
        else:
            alpha = self._alpha(context)
            if alpha > 0:
                if verbose:
                    print "backing off for %s" % (context + (word,),)
                return alpha * self._backoff.prob(word, context[1:], verbose)
            else:
                if verbose:
                    print "no backoff for %s as model doesn't do any smoothing" % word
                return alpha

    def _alpha(self, context, verbose=False):
        """Get the backoff alpha value for the given context
        """
        error_message = "Alphas and backoff are not defined for unigram models"
        assert (not self._n == 1), error_message

        if context in self._backoff_alphas:
            res = self._backoff_alphas[context]
        else:
            res = 1
        if verbose:
            print " alpha: %s = %s" % (context, res)
        return res


    def logprob(self, word, context, verbose=False):
        """
        Evaluate the (negative) log probability of this word in this context.
        """

        return -log(self.prob(word, context, verbose), 2)

    # NB, this will always start with the same word, since the model
    # is trained on a single text
    def generate(self, num_words, context=()):
        '''Generate random text based on the language model.'''
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        # condition on the left padding plus the most recent (n-1) words
        context = (self._lpad + tuple(context))[-self._n + 1:]
        # print "Context (%d): <%s>" % (self._n, ','.join(context))
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text, pad_left=False, pad_right=False,
                verbose=False, perItem=False):
        """
        Evaluate the total entropy of a text with respect to the model.
        This is the sum of the costs (negative log2 probabilities) of
        each word in the message.  Padding follows the settings given
        at construction time.
        """
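        # i.e., with c = number of ngrams scored in the padded text,
        #   H(text)     = sum_i -log2 P(w_i | w_{i-n+1} ... w_{i-1})
        #   H(text) / c   is returned when perItem is True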
        # This version takes account of padding for greater accuracy
        e = 0.0
        for ngram in ingrams(chain(self._lpad, text, self._rpad), self._n):
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cost = self.logprob(token, context, verbose)  # _negative_
                                                          # log2 prob == cost!
            if verbose:
                print "p(%s|%s) = [%s-gram] %7f" % (token, context, self._n, 2 ** -cost)
            e += cost
        if perItem:
            return e / ((len(text) + self._padLen) - (self._n - 1))
        else:
            return e

    def dump(self, file, logBase=None, precision=7):
        """Dump this model in SRILM/ARPA/Doug Paul format

        Use logBase=10 and the default precision to get something comparable
        to SRILM ngram-model -lm output
        @param file: file to dump to
        @type file: file
        @param logBase: if not None, output log probabilities to the specified base
        @type logBase: int|None"""
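        # The layout written below is the usual ARPA one, schematically
        # (fields are tab-separated, values illustrative only):
        #
        #   \data\
        #   ngram 1=4
        #   ngram 2=7
        #
        #   \1-grams:
        #   -1.146128   the    -0.301030
        #   ...
        #
        #   \2-grams:
        #   -0.301030   the jury
        #   ...
        #
        #   \end\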
        file.write('\n\\data\\\n')
        self._writeLens(file)
        self._writeModels(file, logBase, precision, None)
        file.write('\\end\\\n')

    def _writeLens(self, file):
        if self._n > 1:
            self._backoff._writeLens(file)
            file.write('ngram %s=%s\n' % (self._n,
                                          sum(len(self._model[c].samples())
                                              for c in self._model.keys())))
        else:
            file.write('ngram 1=%s\n' % len(self._model.samples()))


    def _writeModels(self, file, logBase, precision, alphas):
        if self._n > 1:
            self._backoff._writeModels(file, logBase, precision, self._backoff_alphas)
        file.write('\n\\%s-grams:\n' % self._n)
        if self._n == 1:
            self._writeProbs(self._model, file, logBase, precision, (), alphas)
        else:
            for c in sorted(self._model.conditions()):
                self._writeProbs(self._model[c], file, logBase, precision,
                                 c, alphas)

    def _writeProbs(self, pd, file, logBase, precision, ctxt, alphas):
        if self._n == 1:
            for k in sorted(pd.samples() + ['<unk>', '<s>']):
                if k == '<s>':
                    file.write('-99')
                elif k == '<unk>':
                    _writeProb(file, logBase, precision, 1 - pd.discount())
                else:
                    _writeProb(file, logBase, precision, pd.prob(k))
                file.write('\t%s' % k)
                if k not in ('</s>', '<unk>'):
                    file.write('\t')
                    _writeProb(file, logBase, precision, alphas[ctxt + (k,)])
                file.write('\n')
        else:
            ctxtString = ' '.join(ctxt)
            for k in sorted(pd.samples()):
                _writeProb(file, logBase, precision, pd.prob(k))
                file.write('\t%s %s' % (ctxtString, k))
                if alphas is not None:
                    file.write('\t')
                    _writeProb(file, logBase, precision, alphas[ctxt + (k,)])
                file.write('\n')

    def __contains__(self, item):
        try:
            return item in self._model
        except:
            try:
                # hack if model is an MLEProbDist, more efficient
                return item in self._model._freqdist
            except:
                return item in self._model.samples()

    def __getitem__(self, item):
        return self._model[item]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (self._N, self._n)

def _writeProb(file, logBase, precision, p):
    file.write('%.*g' % (precision,
                         p if logBase is None else log(p, logBase)))

def demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    # note: train is a list of tokenised sentences, and estimator must be
    # passed by keyword (the third positional argument is pad_left)
    lm = NgramModel(3, brown.sents(categories='news'), estimator=estimator)
    print lm
    # print lm.entropy(sent)
    text = lm.generate(100)
    import textwrap
    print '\n'.join(textwrap.wrap(' '.join(text)))

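# Illustrative held-out scoring (a sketch, not part of the original file):
# with `lm` trained as in demo() above, a tokenised sentence can be scored
# per word using the entropy method, e.g.
#
#   sent = 'There was no comment from the jury'.split()
#   print lm.entropy(sent, perItem=True)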
if __name__ == '__main__':
    demo()