ngram_3.0.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain

author: Henry S. Thompson <ht@inf.ed.ac.uk>
date:   Mon, 09 Mar 2020 14:58:04 +0000
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2014 NLTK Project
# Authors: Steven Bird <stevenbird1@gmail.com>
#          Daniel Blanchard <dblanchard@ets.org>
#          Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
######## Copied from http://www.nltk.org/_modules/nltk/model/ngram.html 2017-01-14
######## Not actually part of 3.0 release
######## "© Copyright 2015, NLTK Project. Last updated on Feb 26, 2015. Created using Sphinx 1.2.3"
from __future__ import unicode_literals

from itertools import chain
from math import log

from nltk.probability import (FreqDist,
                              ConditionalProbDist,
                              ConditionalFreqDist,
                              LidstoneProbDist)
from nltk.util import ngrams
from nltk.model.api import ModelI

from nltk import compat


def _estimator(fdist, *estimator_args, **estimator_kwargs):
    """
    Default estimator function using a LidstoneProbDist.
    """
    # can't be an instance method of NgramModel, because
    # instance methods can't be pickled.
    return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)

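# A rough sketch of the smoothing performed by the default estimator above:
# LidstoneProbDist adds a constant gamma to every observed count, so a word w
# with count c(w), out of N total counts spread over B bins, gets roughly
#
#     P(w) = (c(w) + gamma) / (N + B * gamma)
#
# e.g. the doctest below passes gamma = 0.2.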

@compat.python_2_unicode_compatible
class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text. An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        0.5776...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist is passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero; if not, the failing assert reports n
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()

        # set read-only ngrams set (see property declaration below to reconfigure)
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        for sent in train:
            raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
            for ngram in raw_ngrams:
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[(context, token)] += 1

        self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                       pad_left, pad_right,
                                       estimator,
                                       *estimator_args,
                                       **estimator_kwargs)

            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0 <= total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be at least 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise the
                # denominator below would be zero (or negative)
                assert (0 <= backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt

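    # The loop above computes the Katz backoff weights. As a rough sketch of
    # the arithmetic (writing P for this model and P_backoff for the
    # (n-1)-gram model), for each observed context c:
    #
    #     beta(c)  = 1 - sum over observed w of P(w | c)
    #     alpha(c) = beta(c) / (1 - sum over observed w of P_backoff(w | c[1:]))
    #
    # so unseen words in context c receive alpha(c) * P_backoff(w | c[1:]),
    # and the distribution over all words still sums to (approximately) 1.
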
    def _words_following(self, context, cond_freq_dist):
        for ctxt, word in cond_freq_dist.iterkeys():
            if ctxt == context:
                yield word

    def prob(self, word, context):
        """
        Evaluate the probability of this word in this context using Katz Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        context = tuple(context)
        if (context + (word,) in self._ngrams) or (self.is_unigram_model):
            return self._probdist.prob((context, word))
        else:
            return self._alpha(context) * self._backoff.prob(word, context[1:])

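    # Sketch of the backoff recursion implemented by prob(), assuming a
    # trigram model:
    #
    #     P(w | u, v) = P_smoothed(w | u, v)               if (u, v, w) was seen
    #                 = alpha((u, v)) * P_bigram(w | v)    otherwise
    #
    # and the bigram model applies the same rule again, bottoming out at the
    # unigram model, which always answers from its own probdist.
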
    def _alpha(self, context):
        """Get the backoff alpha value for the given context
        """
        error_message = "Alphas and backoff are not defined for unigram models"
        assert not self.is_unigram_model, error_message

        if context in self._backoff_alphas:
            return self._backoff_alphas[context]
        else:
            return 1

    def logprob(self, word, context):
        """
        Evaluate the (negative) log probability of this word in this context.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        return -log(self.prob(word, context), 2)

    @property
    def ngrams(self):
        return self._ngrams

    @property
    def backoff(self):
        return self._backoff

    @property
    def probdist(self):
        return self._probdist

    def choose_random_word(self, context):
        '''
        Randomly select a word that is likely to appear in this context.

        :param context: the context the word is in
        :type context: list(str)
        '''

        return self.generate(1, context)[-1]

    # NB, this will always start with the same word if the model
    # was trained on a single text

    def generate(self, num_words, context=()):
        '''
        Generate random text based on the language model.

        :param num_words: number of words to generate
        :type num_words: int
        :param context: initial words in generated string
        :type context: list(str)
        '''

        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        context = (self._lpad + tuple(context))[- self._n + 1:]
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

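    # Sketch of how _generate_one backs off, assuming a trigram model asked to
    # continue the single-word history ['the']: the padded context becomes
    # ('', 'the'); if that context was never observed, the bigram model is
    # tried with context ('the',), and if every level fails the unigram case
    # returns '.' as a fallback token.
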
    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given evaluation text.
        This is the average negative log probability of each word in the text.

        :param text: words to use for evaluation
        :type text: list(str)
        """

        e = 0.0
        text = list(self._lpad) + text + list(self._rpad)
        for i in range(self._n - 1, len(text)):
            context = tuple(text[i - self._n + 1:i])
            token = text[i]
            e += self.logprob(token, context)
        return e / float(len(text) - (self._n - 1))

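    # In formula terms (a sketch of what the loop above computes), for an
    # evaluation text w_1 ... w_N after padding:
    #
    #     H(text) = -(1/N) * sum over i of log2 P(w_i | w_{i-n+1} ... w_{i-1})
    #
    # where P is the backoff probability returned by prob() above.
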
    def perplexity(self, text):
        """
        Calculates the perplexity of the given text.
        This is simply 2 ** cross-entropy for the text.

        :param text: words to calculate perplexity of
        :type text: list(str)
        """

        return pow(2.0, self.entropy(text))

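    # Worked example of the relationship (illustrative numbers only): if the
    # cross-entropy of a text is 3.0 bits per word, its perplexity is
    # 2 ** 3.0 = 8.0, i.e. the model is on average as uncertain as a uniform
    # choice among 8 words at each position.
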
    def __contains__(self, item):
        return tuple(item) in self._probdist.freqdist

    def __getitem__(self, item):
        return self._probdist[tuple(item)]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)


def teardown_module(module=None):
    from nltk.corpus import brown
    brown._unload()

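
# A minimal usage sketch, mirroring the doctest in NgramModel.__init__; it
# assumes an NLTK installation that still provides nltk.model.api and the
# Brown corpus, which may not hold for this snapshot.
def _demo():
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist

    # Lidstone-smoothed trigram model over the Brown news category
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator=est)
    print(lm)

    sentence = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday',
                'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary',
                'election', 'produced', '``', 'no', 'evidence', "''", 'that',
                'any', 'irregularities', 'took', 'place', '.']
    print(lm.entropy(sentence))
    print(lm.perplexity(sentence))
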
if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)