comparison ngram_3.0.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2014 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
#          Daniel Blanchard <dblanchard@ets.org>
#          Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
######## Copied from http://www.nltk.org/_modules/nltk/model/ngram.html 2017-01-14
######## Not actually part of 3.0 release
######## "© Copyright 2015, NLTK Project. Last updated on Feb 26, 2015. Created using Sphinx 1.2.3"
from __future__ import unicode_literals

from itertools import chain
from math import log

from nltk.probability import (FreqDist,
                              ConditionalProbDist,
                              ConditionalFreqDist,
                              LidstoneProbDist)
from nltk.util import ngrams
from nltk.model.api import ModelI

from nltk import compat


def _estimator(fdist, *estimator_args, **estimator_kwargs):
    """
    Default estimator function using a LidstoneProbDist.
    """
    # Can't be an instance method of NgramModel, as instance methods
    # can't be pickled either.
    return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)


@compat.python_2_unicode_compatible
class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        0.5776...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist is passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()

        # set read-only ngrams set (see property declaration below to reconfigure)
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        for sent in train:
            raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
            for ngram in raw_ngrams:
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[(context, token)] += 1

        self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                       pad_left, pad_right,
                                       estimator,
                                       *estimator_args,
                                       **estimator_kwargs)

            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0 <= total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # an error when we try subtracting it from 1 in the denominator
                assert (0 <= backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt

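    # The backoff alphas computed above implement the Katz backoff weight for
    # each observed context:
    #
    #     alpha(ctxt) = (1 - sum of prob(w, ctxt) over words w seen after ctxt)
    #                   / (1 - sum of backoff prob(w, ctxt[1:]) over those same words)
    #
    # i.e. the probability mass left over for unseen continuations, renormalised
    # against the lower-order model (used by _alpha() and prob() below).
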
    def _words_following(self, context, cond_freq_dist):
        for ctxt, word in cond_freq_dist.iterkeys():
            if ctxt == context:
                yield word

    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context using Katz Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        context = tuple(context)
        if (context + (word,) in self._ngrams) or (self.is_unigram_model):
            return self._probdist.prob((context, word))
        else:
            return self._alpha(context) * self._backoff.prob(word, context[1:])

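    # prob() is the Katz backoff recursion: an observed ngram gets its smoothed
    # estimate directly, everything else backs off to the (n-1)-gram model
    # scaled by alpha(context):
    #
    #     P(word | context) = probdist estimate of (context, word)           if seen
    #                       = alpha(context) * P_backoff(word, context[1:])  otherwise
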
    def _alpha(self, context):
        """Get the backoff alpha value for the given context
        """
        error_message = "Alphas and backoff are not defined for unigram models"
        assert not self.is_unigram_model, error_message

        if context in self._backoff_alphas:
            return self._backoff_alphas[context]
        else:
            return 1

    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        return -log(self.prob(word, context), 2)

    @property
    def ngrams(self):
        return self._ngrams

    @property
    def backoff(self):
        return self._backoff

    @property
    def probdist(self):
        return self._probdist

    def choose_random_word(self, context):
        '''
        Randomly select a word that is likely to appear in this context.

        :param context: the context the word is in
        :type context: list(str)
        '''

        return self.generate(1, context)[-1]

    # NB, this will always start with the same word if the model
    # was trained on a single text

    def generate(self, num_words, context=()):
        '''
        Generate random text based on the language model.

        :param num_words: number of words to generate
        :type num_words: int
        :param context: initial words in generated string
        :type context: list(str)
        '''

        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        context = (self._lpad + tuple(context))[- self._n + 1:]
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

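    # _generate_one() backs off through successively shorter contexts until it
    # finds one seen in training (the `context in self` test), then samples a
    # word from that context's distribution; if even the unigram model has no
    # match it falls back to emitting '.'.
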
    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given evaluation text.
        This is the average negative log2 probability of each word in the text.

        :param text: words to use for evaluation
        :type text: list(str)
        """

        e = 0.0
        text = list(self._lpad) + text + list(self._rpad)
        for i in range(self._n - 1, len(text)):
            context = tuple(text[i - self._n + 1:i])
            token = text[i]
            e += self.logprob(token, context)
        return e / float(len(text) - (self._n - 1))

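    # entropy() sums logprob() (i.e. -log2 P) over every scored position of the
    # padded text and divides by the number of positions:
    #
    #     H(text) = (1 / M) * sum_i -log2 P(w_i | w_{i-n+1} ... w_{i-1}),
    #     where M = len(padded text) - (n - 1)
    #
    # perplexity() below is then simply 2 ** H(text).
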
    def perplexity(self, text):
        """
        Calculates the perplexity of the given text.
        This is simply 2 ** cross-entropy for the text.

        :param text: words to calculate perplexity of
        :type text: list(str)
        """

        return pow(2.0, self.entropy(text))

    def __contains__(self, item):
        return tuple(item) in self._probdist.freqdist

    def __getitem__(self, item):
        return self._probdist[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)


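# A minimal usage sketch, added for illustration and not part of the copied
# NLTK module. It mirrors the doctest in NgramModel.__init__: train a trigram
# model on the Brown news category, then query probabilities, entropy,
# perplexity and generation. It assumes an NLTK installation that still
# provides nltk.model.api and the probability classes imported above, plus a
# downloaded Brown corpus; it is defined here but never called.
def _usage_sketch():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist

    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator=est)

    sentence = ['There', 'is', 'no', 'evidence', 'that', 'any',
                'irregularities', 'took', 'place', '.']
    print(lm)                                            # <NgramModel with ... 3-grams>
    print(lm.prob('place', ['irregularities', 'took']))  # P(word | two-word context)
    print(lm.entropy(sentence))                          # average -log2 probability per word
    print(lm.perplexity(sentence))                       # 2 ** entropy
    print(lm.generate(10, context=['There']))            # random continuation of 'There'

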
def teardown_module(module=None):
    from nltk.corpus import brown
    brown._unload()

if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)