annotate ngram_3.0.py @ 14:1e961cf10f88

10a is losing at 1R
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 21 Mar 2020 21:15:52 +0000
parents fee51ab07d09
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 # Natural Language Toolkit: Language Models
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 #
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Copyright (C) 2001-2014 NLTK Project
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # Authors: Steven Bird <stevenbird1@gmail.com>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 # Daniel Blanchard <dblanchard@ets.org>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 # Ilia Kurenkov <ilia.kurenkov@gmail.com>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 # URL: <http://nltk.org/>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 # For license information, see LICENSE.TXT
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 ######## Copied from http://www.nltk.org/_modules/nltk/model/ngram.html 2017-01-14
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 ######## Not actually part of 3.0 release
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 ######## "© Copyright 2015, NLTK Project. Last updated on Feb 26, 2015. Created using Sphinx 1.2.3"
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 from __future__ import unicode_literals
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 from itertools import chain
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 from math import log
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 from nltk.probability import (FreqDist,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18 ConditionalProbDist,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19 ConditionalFreqDist,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
20 LidstoneProbDist)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 from nltk.util import ngrams
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
22 from nltk.model.api import ModelI
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24 from nltk import compat
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27 def _estimator(fdist, *estimator_args, **estimator_kwargs):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 Default estimator function using a SimpleGoodTuringProbDist.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 # can't be an instance method of NgramModel as they
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 # can't be pickled either.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36 @compat.python_2_unicode_compatible
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37 class NgramModel(ModelI):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
38 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
39 A processing interface for assigning a probability to the next word.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
40 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
42 def __init__(self, n, train, pad_left=True, pad_right=False,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
43 estimator=None, *estimator_args, **estimator_kwargs):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
44 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
45 Create an ngram language model to capture patterns in n consecutive
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
46 words of training text. An estimator smooths the probabilities derived
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47 from the text and may allow generation of ngrams not seen during
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48 training.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
49
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
50 >>> from nltk.corpus import brown
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
51 >>> from nltk.probability import LidstoneProbDist
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
52 >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53 >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54 >>> lm
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55 <NgramModel with 91603 3-grams>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
56 >>> lm._backoff
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
57 <NgramModel with 62888 2-grams>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
58 >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
59 ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
60 ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
61 ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
62 ... # doctest: +ELLIPSIS
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
63 0.5776...
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
64
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
65 :param n: the order of the language model (ngram size)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
66 :type n: int
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
67 :param train: the training text
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
68 :type train: list(str) or list(list(str))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
69 :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
70 :type pad_left: bool
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
71 :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
72 :type pad_right: bool
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
73 :param estimator: a function for generating a probability distribution
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
74 :type estimator: a function that takes a ConditionalFreqDist and
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
75 returns a ConditionalProbDist
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
76 :param estimator_args: Extra arguments for estimator.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
77 These arguments are usually used to specify extra
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
78 properties for the probability distributions of individual
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
79 conditions, such as the number of bins they contain.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
80 Note: For backward-compatibility, if no arguments are specified, the
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
81 number of bins in the underlying ConditionalFreqDist are passed to
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
82 the estimator as an argument.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
83 :type estimator_args: (any)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
84 :param estimator_kwargs: Extra keyword arguments for the estimator
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
85 :type estimator_kwargs: (any)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
86 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
87
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
88 # protection from cryptic behavior for calling programs
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
89 # that use the pre-2.0.2 interface
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
90 assert(isinstance(pad_left, bool))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
91 assert(isinstance(pad_right, bool))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
92
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
93 # make sure n is greater than zero, otherwise print it
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
94 assert (n > 0), n
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
95
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
96 # For explicitness save the check whether this is a unigram model
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
97 self.is_unigram_model = (n == 1)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
98 # save the ngram order number
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
99 self._n = n
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
100 # save left and right padding
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
101 self._lpad = ('',) * (n - 1) if pad_left else ()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
102 self._rpad = ('',) * (n - 1) if pad_right else ()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
103
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
104 if estimator is None:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
105 estimator = _estimator
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
106
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
107 cfd = ConditionalFreqDist()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
108
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
109 # set read-only ngrams set (see property declaration below to reconfigure)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
110 self._ngrams = set()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
111
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
112 # If given a list of strings instead of a list of lists, create enclosing list
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
113 if (train is not None) and isinstance(train[0], compat.string_types):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
114 train = [train]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
115
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
116 for sent in train:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
117 raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
118 for ngram in raw_ngrams:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
119 self._ngrams.add(ngram)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
120 context = tuple(ngram[:-1])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
121 token = ngram[-1]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
122 cfd[(context, token)] += 1
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
123
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
124 self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
125
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
126 # recursively construct the lower-order models
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
127 if not self.is_unigram_model:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
128 self._backoff = NgramModel(n-1, train,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
129 pad_left, pad_right,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
130 estimator,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
131 *estimator_args,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
132 **estimator_kwargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
133
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
134 self._backoff_alphas = dict()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
135 # For each condition (or context)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
136 for ctxt in cfd.conditions():
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
137 backoff_ctxt = ctxt[1:]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
138 backoff_total_pr = 0.0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
139 total_observed_pr = 0.0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
140
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
141 # this is the subset of words that we OBSERVED following
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
142 # this context.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
143 # i.e. Count(word | context) > 0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
144 for word in self._words_following(ctxt, cfd):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
145 total_observed_pr += self.prob(word, ctxt)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
146 # we also need the total (n-1)-gram probability of
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
147 # words observed in this n-gram context
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
148 backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
149
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
150 assert (0 <= total_observed_pr <= 1), total_observed_pr
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
151 # beta is the remaining probability weight after we factor out
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
152 # the probability of observed words.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
153 # As a sanity check, both total_observed_pr and backoff_total_pr
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
154 # must be GE 0, since probabilities are never negative
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
155 beta = 1.0 - total_observed_pr
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
156
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
157 # backoff total has to be less than one, otherwise we get
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
158 # an error when we try subtracting it from 1 in the denominator
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
159 assert (0 <= backoff_total_pr < 1), backoff_total_pr
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
160 alpha_ctxt = beta / (1.0 - backoff_total_pr)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
161
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
162 self._backoff_alphas[ctxt] = alpha_ctxt
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
163
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
164 def _words_following(self, context, cond_freq_dist):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
165 for ctxt, word in cond_freq_dist.iterkeys():
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
166 if ctxt == context:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
167 yield word
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
168
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
169 def prob(self, word, context):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
170 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
171 Evaluate the probability of this word in this context using Katz Backoff.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
172
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
173 :param word: the word to get the probability of
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
174 :type word: str
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
175 :param context: the context the word is in
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
176 :type context: list(str)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
177 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
178 context = tuple(context)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
179 if (context + (word,) in self._ngrams) or (self.is_unigram_model):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
180 return self._probdist.prob((context, word))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
181 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
182 return self._alpha(context) * self._backoff.prob(word, context[1:])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
183
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
184 def _alpha(self, context):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
185 """Get the backoff alpha value for the given context
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
186 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
187 error_message = "Alphas and backoff are not defined for unigram models"
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
188 assert not self.is_unigram_model, error_message
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
189
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
190 if context in self._backoff_alphas:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
191 return self._backoff_alphas[context]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
192 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
193 return 1
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
194
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
195 def logprob(self, word, context):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
196 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
197 Evaluate the (negative) log probability of this word in this context.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
198
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
199 :param word: the word to get the probability of
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
200 :type word: str
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
201 :param context: the context the word is in
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
202 :type context: list(str)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
203 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
204 return -log(self.prob(word, context), 2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
205
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
206 @property
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
207 def ngrams(self):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
208 return self._ngrams
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
209
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
210 @property
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
211 def backoff(self):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
212 return self._backoff
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
213
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
214 @property
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
215 def probdist(self):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
216 return self._probdist
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
217
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
218 def choose_random_word(self, context):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
219 '''
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
220 Randomly select a word that is likely to appear in this context.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
221
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
222 :param context: the context the word is in
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
223 :type context: list(str)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
224 '''
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
225
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
226 return self.generate(1, context)[-1]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
227
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
228 # NB, this will always start with same word if the model
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
229 # was trained on a single text
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
230
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
231 def generate(self, num_words, context=()):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
232 '''
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
233 Generate random text based on the language model.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
234
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
235 :param num_words: number of words to generate
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
236 :type num_words: int
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
237 :param context: initial words in generated string
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
238 :type context: list(str)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
239 '''
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
240
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
241 text = list(context)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
242 for i in range(num_words):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
243 text.append(self._generate_one(text))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
244 return text
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
245
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
246 def _generate_one(self, context):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
247 context = (self._lpad + tuple(context))[- self._n + 1:]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
248 if context in self:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
249 return self[context].generate()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
250 elif self._n > 1:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
251 return self._backoff._generate_one(context[1:])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
252 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
253 return '.'
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
254
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
255 def entropy(self, text):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
256 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
257 Calculate the approximate cross-entropy of the n-gram model for a
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
258 given evaluation text.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
259 This is the average log probability of each word in the text.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
260
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
261 :param text: words to use for evaluation
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
262 :type text: list(str)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
263 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
264
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
265 e = 0.0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
266 text = list(self._lpad) + text + list(self._rpad)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
267 for i in range(self._n - 1, len(text)):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
268 context = tuple(text[i - self._n + 1:i])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
269 token = text[i]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
270 e += self.logprob(token, context)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
271 return e / float(len(text) - (self._n - 1))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
272
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
273 def perplexity(self, text):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
274 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
275 Calculates the perplexity of the given text.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
276 This is simply 2 ** cross-entropy for the text.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
277
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
278 :param text: words to calculate perplexity of
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
279 :type text: list(str)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
280 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
281
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
282 return pow(2.0, self.entropy(text))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
283
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
284 def __contains__(self, item):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
285 return tuple(item) in self._probdist.freqdist
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
286
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
287 def __getitem__(self, item):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
288 return self._probdist[tuple(item)]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
289
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
290 def __repr__(self):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
291 return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
292
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
293
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
294 def teardown_module(module=None):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
295 from nltk.corpus import brown
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
296 brown._unload()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
297
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
298 if __name__ == "__main__":
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
299 import doctest
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
300 doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)