annotate ngram.py @ 42:59517f60826d

quiet working, -d to use ssh -v
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 04 Jan 2022 10:42:06 +0000
parents fee51ab07d09
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 # Natural Language Toolkit: Language Models
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 #
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Copyright (C) 2001-2009 NLTK Project
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # Author: Steven Bird <sb@csse.unimelb.edu.au>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 # URL: <http://www.nltk.org/>
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 # For license information, see LICENSE.TXT
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 import random, types
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 from itertools import chain
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 from math import log
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 MLEProbDist, FreqDist)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 try:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 from nltk.util import ingrams
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 except:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 from nltkx.util import ingrams
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19 from api import *
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
20
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 class NgramModel(ModelI):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
22 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23 A processing interface for assigning a probability to the next word.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def __init__(self, n, train, pad_left=False, pad_right=False,
             estimator=None, *estimator_args, **estimator_kwargs):
    """
    Create an ngram language model capturing patterns in n consecutive
    words of training text.  An estimator smooths the probabilities
    derived from the text and may allow generation of ngrams not seen
    during training.

    For n > 1 a model of order n-1 is built recursively from the same
    training text and stored as the backoff model, and a backoff weight
    (alpha) is computed for every context observed in training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text.  A generator is accepted but is
        materialised into a list, because the text is traversed again
        when the lower-order models are built.
    @type train: C{list} of C{list} of C{string}
    @param pad_left: whether to pad the left of each sentence with
        n-1 C{'<s>'} tokens
    @type pad_left: bool
    @param pad_right: whether to pad the right of each sentence with
        C{max(1, n-1)} C{'</s>'} tokens
    @type pad_right: bool
    @param estimator: a function for generating a probability
        distribution.  Defaults to an C{MLEProbDist} over the
        frequency distribution.
    @type estimator: a function that takes a frequency distribution
        (plus any extra arguments) and returns a probability
        distribution
    @param estimator_args: Extra arguments for estimator.
        These arguments are usually used to specify extra
        properties for the probability distributions of individual
        conditions, such as the number of bins they contain.
        Note: For backward-compatibility, if no extra arguments are
        specified, a bin count is passed to the estimator instead:
        C{fd.B()} for a unigram model, C{len(cfd)} otherwise.
    @type estimator_args: (any)
    @param estimator_kwargs: Extra keyword arguments for the estimator
    @type estimator_kwargs: (any)
    @raise AssertionError: if C{pad_left} or C{pad_right} is not a
        bool, if estimator arguments are supplied without an
        estimator, or if the probability sums computed for a context
        fall outside their valid range.
    """
    # protection from cryptic behavior for calling programs
    # that use the pre-2.0.2 interface
    assert isinstance(pad_left, bool)
    assert isinstance(pad_right, bool)

    # Given backoff, a generator isn't acceptable: the text is consumed
    # once per model order.  This has to happen before len(train) below,
    # since generators have no length.
    if isinstance(train, types.GeneratorType):
        train = list(train)

    self._n = n
    self._W = len(train)
    self._lpad = ('<s>',) * (n - 1) if pad_left else ()
    # Need _rpad even for unigrams or padded entropy will give
    # wrong answer because '' will be treated as unseen...
    self._rpad = ('</s>',) * max(1, n - 1) if pad_right else ()
    self._padLen = len(self._lpad) + len(self._rpad)

    self._N = 0
    delta = 1 + self._padLen - n  # len(sent)+delta == ngrams in sent

    if estimator is None:
        # *args / **kwargs always arrive as a tuple / dict (never None),
        # so emptiness is the only meaningful "nothing supplied" test.
        assert not estimator_args and not estimator_kwargs, \
            "estimator_args or _kwargs supplied, but no estimator"
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    if n == 1:
        if pad_right:
            sents = (chain(s, self._rpad) for s in train)
        else:
            sents = train
        fd = FreqDist()
        for s in sents:
            fd.update(s)
        if not estimator_args and not estimator_kwargs:
            self._model = estimator(fd, fd.B())
        else:
            # NOTE(review): here fd.B() is passed in addition to the
            # extra arguments, whereas the n > 1 branch passes only the
            # extra arguments -- confirm this asymmetry is intended.
            self._model = estimator(fd, fd.B(),
                                    *estimator_args, **estimator_kwargs)
        self._N = fd.N()
    else:
        cfd = ConditionalFreqDist()
        self._ngrams = set()

        for sent in train:
            self._N += len(sent) + delta
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1
        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator,
                                              *estimator_args,
                                              **estimator_kwargs)

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                   estimator,
                                   *estimator_args, **estimator_kwargs)

        # Code below here in this method, and the _words_following and
        # _alpha method, are from
        # http://www.nltk.org/_modules/nltk/model/ngram.html
        # "Last updated on Feb 26, 2015"
        self._backoff_alphas = dict()
        # For each condition (or context)
        for ctxt in cfd.conditions():
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0.0
            total_observed_pr = 0.0

            # this is the subset of words that we OBSERVED following
            # this context.
            # i.e. Count(word | context) > 0
            for word in self._words_following(ctxt, cfd):
                total_observed_pr += self.prob(word, ctxt)
                # we also need the total (n-1)-gram probability of
                # words observed in this n-gram context
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
            assert (0 <= total_observed_pr <= 1), \
                "sum of probs for %s out of bounds: %s" % (ctxt, total_observed_pr)
            # beta is the remaining probability weight after we factor out
            # the probability of observed words.
            # As a sanity check, both total_observed_pr and backoff_total_pr
            # must be GE 0, since probabilities are never negative
            beta = 1.0 - total_observed_pr

            # if backoff total is 1, that should mean that all samples
            # occur in this context, so we will never back off.
            # Greater than 1 is an error.
            assert (0 <= backoff_total_pr < 1), \
                "sum of backoff probs for %s out of bounds: %s" % (ctxt, backoff_total_pr)
            alpha_ctxt = beta / (1.0 - backoff_total_pr)

            self._backoff_alphas[ctxt] = alpha_ctxt
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
151
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def _words_following(self, context, cond_freq_dist):
    """
    Iterate over the words recorded after C{context}.

    @param context: the condition to look up in C{cond_freq_dist}
    @param cond_freq_dist: mapping from a context to the frequency
        distribution of the words seen after it
    @return: whatever C{iterkeys()} yields for the distribution stored
        under C{context}
    """
    # Reference only -- the variant published at
    # http://www.nltk.org/_modules/nltk/model/ngram.html
    # (depends on new CFD???):
    #   for ctxt, word in cond_freq_dist.iterkeys():
    #       if ctxt == context:
    #           yield word
    followers = cond_freq_dist[context]
    return followers.iterkeys()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
159
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
160 def prob(self, word, context, verbose=False):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
161 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
162 Evaluate the probability of this word in this context
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
163 using Katz Backoff.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
164 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
165 assert(isinstance(word,types.StringTypes))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
166 context = tuple(context)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
167 if self._n==1:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
168 if not(self._model.SUM_TO_ONE):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
169 # Smoothing models should do the right thing for unigrams
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
170 # even if they're 'absent'
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
171 return self._model.prob(word)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
172 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
173 try:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
174 return self._model.prob(word)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
175 except:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
176 raise RuntimeError("No probability mass assigned"
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
177 "to unigram %s" % (word))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
178 if context + (word,) in self._ngrams:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
179 return self[context].prob(word)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
180 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
181 alpha=self._alpha(context)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
182 if alpha>0:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
183 if verbose:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
184 print "backing off for %s"%(context+(word,),)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
185 return alpha * self._backoff.prob(word, context[1:],verbose)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
186 else:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
187 if verbose:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
188 print "no backoff for %s as model doesn't do any smoothing"%word
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
189 return alpha
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
190
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def _alpha(self, context, verbose=False):
    """Get the backoff alpha value for the given context.

    @param context: the context to look up among the stored backoff alphas
    @param verbose: if true, print the context and the alpha selected
    @return: the alpha stored for C{context}, or 1 when none is stored
    """
    error_message = "Alphas and backoff are not defined for unigram models"
    assert (not self._n == 1), error_message

    # Contexts without a stored alpha fall back to a weight of 1.
    if context in self._backoff_alphas:
        res = self._backoff_alphas[context]
    else:
        res = 1
    if verbose:
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(" alpha: %s = %s" % (context, res))
    return res
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
204
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
205
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def logprob(self, word, context, verbose=False):
    """
    Return the negative base-2 logarithm of the probability that
    self.prob assigns to this word in this context.

    @param word: the word to score
    @param context: the context preceding the word
    @param verbose: passed through unchanged to self.prob
    """
    probability = self.prob(word, context, verbose)
    return -log(probability, 2)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
212
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
213 # NB, this will always start with same word since model
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
214 # is trained on a single text
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def generate(self, num_words, context=()):
    '''Generate random text based on the language model.

    Starts from a copy of C{context} and appends C{num_words} items, each
    obtained from self._generate_one given everything produced so far.
    Returns the resulting list (initial context included).
    '''
    produced = list(context)
    for _ in range(num_words):
        next_word = self._generate_one(produced)
        produced.append(next_word)
    return produced
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
221
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def _generate_one(self, context):
    """Produce one item to follow C{context}.

    The context is prefixed with self._prefix and cut down with the slice
    [-self._n+1:]. If this model contains the result, the entry stored for
    it generates the item; otherwise a model of order > 1 delegates to its
    backoff model with the first context element dropped, and a model of
    order 1 returns '.'.
    """
    history = self._prefix + tuple(context)
    # NOTE(review): when self._n == 1 this slice is [0:], which keeps the
    # whole history rather than an empty context -- confirm that is intended.
    context = history[-self._n + 1:]
    if context in self:
        return self[context].generate()
    if self._n > 1:
        return self._backoff._generate_one(context[1:])
    return '.'
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
231
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def entropy(self, text, pad_left=False, pad_right=False,
            verbose=False, perItem=False):
    """
    Evaluate the total entropy of a text with respect to the model.
    This is the sum of the negative log2 probability (the cost) of each
    word in the text given its preceding context.

    @param text: the sequence of tokens to score
    @param pad_left: accepted but never consulted here; self._lpad is
        always prepended to the text
    @param pad_right: accepted but never consulted here; self._rpad is
        always appended to the text
    @param verbose: if true, print the probability of every n-gram scored
        (also passed on to self.logprob)
    @param perItem: if true, return the total divided by
        (len(text) + self._padLen) - (self._n - 1) rather than the total
    @return: the total (or per-item) cost
    """
    # This version takes account of padding for greater accuracy
    e = 0.0
    for ngram in ngrams(chain(self._lpad, text, self._rpad), self._n):
        context = tuple(ngram[:-1])
        token = ngram[-1]
        # logprob returns the _negative_ log2 probability, i.e. the cost
        cost = self.logprob(token, context, verbose)
        if verbose:
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("p(%s|%s) = [%s-gram] %7f" % (token, context, self._n,
                                                2 ** -cost))
        e += cost
    if perItem:
        return e / ((len(text) + self._padLen) - (self._n - 1))
    else:
        return e
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
252
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def dump(self, file, logBase=None, precision=7):
    """Dump this model in SRILM/ARPA/Doug Paul format

    Writes a data header, then the n-gram counts (via self._writeLens),
    then the model sections (via self._writeModels), then an end marker.

    Use logBase=10 and the default precision to get something comparable
    to SRILM ngram-model -lm output
    @param file to dump to
    @type file file
    @param logBase If not None, output logs to the specified base
    @type logBase int|None
    @param precision passed on to self._writeModels
    @type precision int"""
    opening = '\n\\data\\\n'
    closing = '\\end\\\n'
    file.write(opening)
    self._writeLens(file)
    # The top-level model is handed no alphas.
    self._writeModels(file, logBase, precision, None)
    file.write(closing)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
266
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def _writeLens(self, file):
    """Write one 'ngram N=count' line per model order, lowest order first.

    For an order above 1 the backoff model writes its lines first, and the
    count is the number of samples summed over every key of self._model.
    Otherwise the count is the number of samples of self._model itself.
    """
    if self._n <= 1:
        file.write('ngram 1=%s\n' % len(self._model.samples()))
        return
    self._backoff._writeLens(file)
    total = 0
    for condition in self._model.keys():
        total += len(self._model[condition].samples())
    file.write('ngram %s=%s\n' % (self._n, total))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
275
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
276
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def _writeModels(self, file, logBase, precision, alphas):
    """Write the N-grams section of every model order, lowest order first.

    A model of order above 1 first asks its backoff model to write itself,
    handing it self._backoff_alphas. It then writes its own section header
    followed by its entries via self._writeProbs: a single call with an
    empty context for order 1, otherwise one call per condition of
    self._model, in sorted order.
    """
    has_backoff = self._n > 1
    if has_backoff:
        self._backoff._writeModels(file, logBase, precision,
                                   self._backoff_alphas)
    file.write('\n\\%s-grams:\n' % self._n)
    if self._n == 1:
        self._writeProbs(self._model, file, logBase, precision, (), alphas)
        return
    for condition in sorted(self._model.conditions()):
        distribution = self._model[condition]
        self._writeProbs(distribution, file, logBase, precision,
                         condition, alphas)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
287
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def _writeProbs(self, pd, file, logBase, precision, ctxt, alphas):
    """Write one line per sample of the distribution C{pd}.

    Each line holds a probability, a tab, the n-gram text and, when alphas
    are supplied, a tab and the alpha stored for that n-gram.

    For a unigram model the entries '<unk>' and '<s>' are added to the
    samples: '<s>' is written with the literal -99 and '<unk>' with
    1 - pd.discount(). No alpha is written for '</s>' or '<unk>'.

    @param pd: the probability distribution whose samples are written
    @param file: the file to write to
    @param logBase: passed to _writeProb; None means raw probabilities
    @param precision: passed to _writeProb
    @param ctxt: tuple of context words shared by every sample of C{pd}
    @param alphas: mapping from n-gram tuple to alpha, or None to write no
        alphas (which is what dump() passes for the top-level model)
    """
    if self._n == 1:
        # list() so the concatenation also works if samples() does not
        # return a list.
        for k in sorted(list(pd.samples()) + ['<unk>', '<s>']):
            if k == '<s>':
                file.write('-99')
            elif k == '<unk>':
                _writeProb(file, logBase, precision, 1 - pd.discount())
            else:
                _writeProb(file, logBase, precision, pd.prob(k))
            file.write('\t%s' % k)
            # Same guard as the higher-order branch: without it, dumping a
            # model whose top level is a unigram indexed None and crashed.
            if alphas is not None and k not in ('</s>', '<unk>'):
                file.write('\t')
                _writeProb(file, logBase, precision, alphas[ctxt + (k,)])
            file.write('\n')
    else:
        ctxtString = ' '.join(ctxt)
        for k in sorted(pd.samples()):
            _writeProb(file, logBase, precision, pd.prob(k))
            file.write('\t%s %s' % (ctxtString, k))
            if alphas is not None:
                file.write('\t')
                _writeProb(file, logBase, precision, alphas[ctxt + (k,)])
            file.write('\n')
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
311
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def __contains__(self, item):
    """Report whether C{item} is known to the underlying model.

    Three membership tests are tried in turn, each one only if the
    previous raised: the model itself, then the model's _freqdist, then
    the model's samples().
    """
    model = self._model
    # Only ordinary errors trigger the fallbacks; KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were by a bare except.
    try:
        return item in model
    except Exception:
        pass
    try:
        # hack if model is an MLEProbDist, more efficient
        return item in model._freqdist
    except Exception:
        return item in model.samples()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
321
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def __getitem__(self, item):
    """Return whatever the underlying model stores under C{item}."""
    model = self._model
    return model[item]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
324
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def __repr__(self):
    """Describe the model by its self._N count and its order self._n."""
    summary = (self._N, self._n)
    return '<NgramModel with %d %d-grams>' % summary
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
327
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def _writeProb(file, logBase, precision, p):
    """Write the value C{p}, or its logarithm, to C{file} in %g format.

    @param file: the file to write to
    @param logBase: if None, C{p} itself is written; otherwise the
        logarithm of C{p} to this base
    @param precision: number of significant digits to write
    @param p: the value to write

    A C{p} of exactly 0 has no logarithm, so when a logBase is given it is
    written as -99, the same stand-in the dump uses for '<s>', instead of
    raising ValueError.
    """
    if logBase is None:
        value = p
    elif p == 0:
        value = -99
    else:
        value = log(p, logBase)
    file.write('%.*g' % (precision, value))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
331
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def demo():
    """Build a trigram model from the 'news' category of the Brown corpus
    with a Lidstone (0.2) estimator, print the model, then print 100
    generated words wrapped into lines.
    """
    from nltk.corpus import brown
    from nltk.probability import LidstoneProbDist, WittenBellProbDist
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Alternative estimator:
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    lm = NgramModel(3, brown.words(categories='news'), estimator)
    # Single parenthesised arguments print identically under the Python 2
    # print statement and the Python 3 print function.
    print(lm)
    text = lm.generate(100)
    import textwrap
    print('\n'.join(textwrap.wrap(' '.join(text))))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
343
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Run the demonstration when this file is executed as a script.
if __name__ == "__main__":
    demo()