Mercurial > hg > python
annotate ngram.py @ 42:59517f60826d
quiet working, -d to use ssh -v
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 04 Jan 2022 10:42:06 +0000 |
parents | fee51ab07d09 |
children |
rev | line source |
---|---|
0
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 # Natural Language Toolkit: Language Models |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Copyright (C) 2001-2009 NLTK Project |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # Author: Steven Bird <sb@csse.unimelb.edu.au> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 # URL: <http://www.nltk.org/> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 # For license information, see LICENSE.TXT |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 import random, types |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 from itertools import chain |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 from math import log |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 from nltk.probability import (ConditionalProbDist, ConditionalFreqDist, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 MLEProbDist, FreqDist) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 try: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 from nltk.util import ingrams |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 except: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 from nltkx.util import ingrams |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 from api import * |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 class NgramModel(ModelI): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 A processing interface for assigning a probability to the next word. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 def __init__(self, n, train, pad_left=False, pad_right=False, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 estimator=None, *estimator_args, **estimator_kwargs): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
29 Creates an ngram language model to capture patterns in n consecutive |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
30 words of training text. An estimator smooths the probabilities derived |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 from the text and may allow generation of ngrams not seen during |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
32 training. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
33 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
34 @param n: the order of the language model (ngram size) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 @type n: C{int} |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
36 @param train: the training text |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
37 @type train: C{list} of C{list} of C{string} |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
38 @param estimator: a function for generating a probability distribution |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
39 @type estimator: a function that takes a C{ConditionalFreqDist} and |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
40 returns a C{ConditionalProbDist} |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 @param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
42 @type pad_left: bool |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
43 @param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
44 @type pad_right: bool |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
45 @param estimator_args: Extra arguments for estimator. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
46 These arguments are usually used to specify extra |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
47 properties for the probability distributions of individual |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
48 conditions, such as the number of bins they contain. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
49 Note: For backward-compatibility, if no arguments are specified, the |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
50 number of bins in the underlying ConditionalFreqDist are passed to |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
51 the estimator as an argument. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
52 @type estimator_args: (any) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
53 @param estimator_kwargs: Extra keyword arguments for the estimator |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
54 @type estimator_kwargs: (any) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
55 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
56 # protection from cryptic behavior for calling programs |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
57 # that use the pre-2.0.2 interface |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
58 assert(isinstance(pad_left, bool)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
59 assert(isinstance(pad_right, bool)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
60 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
61 self._n = n |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
62 self._W = len(train) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
63 self._lpad = ('<s>',) * (n - 1) if pad_left else () |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
64 # Need _rpad even for unigrams or padded entropy will give |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
65 # wrong answer because '' will be treated as unseen... |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
66 self._rpad = ('</s>',) * (max(1,(n - 1))) if pad_right else () |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
67 self._padLen = len(self._lpad)+len(self._rpad) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
68 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
69 self._N=0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
70 delta = 1+self._padLen-n # len(sent)+delta == ngrams in sent |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
71 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
72 if estimator is None: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
73 assert (estimator_args is None) and (estimator_kwargs is None),\ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
74 "estimator_args or _kwargs supplied, but no estimator" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
75 estimator = lambda fdist, bins: MLEProbDist(fdist) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
76 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
77 # Given backoff, a generator isn't acceptable |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
78 if isinstance(train,types.GeneratorType): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
79 train=list(train) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
80 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
81 if n == 1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
82 if pad_right: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
83 sents=(chain(s,self._rpad) for s in train) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
84 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
85 sents=train |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
86 fd=FreqDist() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
87 for s in sents: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
88 fd.update(s) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
89 if not estimator_args and not estimator_kwargs: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
90 self._model = estimator(fd,fd.B()) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
91 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
92 self._model = estimator(fd,fd.B(), |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
93 *estimator_args, **estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
94 self._N=fd.N() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
95 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
96 cfd = ConditionalFreqDist() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
97 self._ngrams = set() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
98 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
99 for sent in train: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
100 self._N+=len(sent)+delta |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
101 for ngram in ingrams(chain(self._lpad, sent, self._rpad), n): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
102 self._ngrams.add(ngram) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
103 context = tuple(ngram[:-1]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
104 token = ngram[-1] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
105 cfd[context][token]+=1 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
106 if not estimator_args and not estimator_kwargs: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
107 self._model = ConditionalProbDist(cfd, estimator, len(cfd)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
108 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
109 self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
110 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
111 # recursively construct the lower-order models |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
112 if n > 1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
113 self._backoff = NgramModel(n-1, train, pad_left, pad_right, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
114 estimator, *estimator_args, **estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
115 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
116 # Code below here in this method, and the _words_following and _alpha method, are from |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
117 # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
118 self._backoff_alphas = dict() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
119 # For each condition (or context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
120 #print cfd,cfd.conditions() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
121 for ctxt in cfd.conditions(): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
122 backoff_ctxt = ctxt[1:] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
123 backoff_total_pr = 0.0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
124 total_observed_pr = 0.0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
125 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
126 # this is the subset of words that we OBSERVED following |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
127 # this context. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
128 # i.e. Count(word | context) > 0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
129 wf=list(self._words_following(ctxt, cfd)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
130 for word in self._words_following(ctxt, cfd): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
131 total_observed_pr += self.prob(word, ctxt) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
132 # we also need the total (n-1)-gram probability of |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
133 # words observed in this n-gram context |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
134 backoff_total_pr += self._backoff.prob(word, backoff_ctxt) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
135 assert (0 <= total_observed_pr <= 1),\ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
136 "sum of probs for %s out of bounds: %s"%(ctxt,total_observed_pr) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
137 # beta is the remaining probability weight after we factor out |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
138 # the probability of observed words. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
139 # As a sanity check, both total_observed_pr and backoff_total_pr |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
140 # must be GE 0, since probabilities are never negative |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
141 beta = 1.0 - total_observed_pr |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
142 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
143 # if backoff total is 1, that should mean that all samples occur in this context, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
144 # so we will never back off. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
145 # Greater than 1 is an error. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
146 assert (0 <= backoff_total_pr < 1), \ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
147 "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
148 alpha_ctxt = beta / (1.0 - backoff_total_pr) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
149 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
150 self._backoff_alphas[ctxt] = alpha_ctxt |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
151 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
152 def _words_following(self, context, cond_freq_dist): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
153 return cond_freq_dist[context].iterkeys() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
154 # below from http://www.nltk.org/_modules/nltk/model/ngram.html, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
155 # depends on new CFD??? |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
156 #for ctxt, word in cond_freq_dist.iterkeys(): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
157 # if ctxt == context: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
158 # yield word |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
159 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
160 def prob(self, word, context, verbose=False): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
161 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
162 Evaluate the probability of this word in this context |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
163 using Katz Backoff. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
164 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
165 assert(isinstance(word,types.StringTypes)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
166 context = tuple(context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
167 if self._n==1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
168 if not(self._model.SUM_TO_ONE): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
169 # Smoothing models should do the right thing for unigrams |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
170 # even if they're 'absent' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
171 return self._model.prob(word) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
172 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
173 try: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
174 return self._model.prob(word) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
175 except: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
176 raise RuntimeError("No probability mass assigned" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
177 "to unigram %s" % (word)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
178 if context + (word,) in self._ngrams: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
179 return self[context].prob(word) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
180 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
181 alpha=self._alpha(context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
182 if alpha>0: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
183 if verbose: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
184 print "backing off for %s"%(context+(word,),) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
185 return alpha * self._backoff.prob(word, context[1:],verbose) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
186 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
187 if verbose: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
188 print "no backoff for %s as model doesn't do any smoothing"%word |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
189 return alpha |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
190 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
191 def _alpha(self, context,verbose=False): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
192 """Get the backoff alpha value for the given context |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
193 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
194 error_message = "Alphas and backoff are not defined for unigram models" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
195 assert (not self._n == 1), error_message |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
196 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
197 if context in self._backoff_alphas: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
198 res = self._backoff_alphas[context] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
199 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
200 res = 1 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
201 if verbose: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
202 print " alpha: %s = %s"%(context,res) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
203 return res |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
204 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
205 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
206 def logprob(self, word, context,verbose=False): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
207 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
208 Evaluate the (negative) log probability of this word in this context. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
209 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
210 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
211 return -log(self.prob(word, context,verbose), 2) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
212 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
213 # NB, this will always start with same word since model |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
214 # is trained on a single text |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
215 def generate(self, num_words, context=()): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
216 '''Generate random text based on the language model.''' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
217 text = list(context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
218 for i in range(num_words): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
219 text.append(self._generate_one(text)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
220 return text |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
221 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
222 def _generate_one(self, context): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
223 context = (self._prefix + tuple(context))[-self._n+1:] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
224 # print "Context (%d): <%s>" % (self._n, ','.join(context)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
225 if context in self: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
226 return self[context].generate() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
227 elif self._n > 1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
228 return self._backoff._generate_one(context[1:]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
229 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
230 return '.' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
231 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
232 def entropy(self, text, pad_left=False, pad_right=False, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
233 verbose=False, perItem=False): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
234 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
235 Evaluate the total entropy of a text with respect to the model. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
236 This is the sum of the log probability of each word in the message. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
237 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
238 # This version takes account of padding for greater accuracy |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
239 e = 0.0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
240 for ngram in ngrams(chain(self._lpad, text, self._rpad), self._n): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
241 context = tuple(ngram[:-1]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
242 token = ngram[-1] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
243 cost=self.logprob(token, context, verbose) # _negative_ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
244 # log2 prob == cost! |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
245 if verbose: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
246 print "p(%s|%s) = [%s-gram] %7f"%(token,context,self._n,2**-cost) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
247 e += cost |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
248 if perItem: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
249 return e/((len(text)+self._padLen)-(self._n - 1)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
250 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
251 return e |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
252 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
253 def dump(self, file, logBase=None, precision=7): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
254 """Dump this model in SRILM/ARPA/Doug Paul format |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
255 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
256 Use logBase=10 and the default precision to get something comparable |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
257 to SRILM ngram-model -lm output |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
258 @param file to dump to |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
259 @type file file |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
260 @param logBase If not None, output logBases to the specified base |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
261 @type logBase int|None""" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
262 file.write('\n\\data\\\n') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
263 self._writeLens(file) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
264 self._writeModels(file,logBase,precision,None) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
265 file.write('\\end\\\n') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
266 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
267 def _writeLens(self,file): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
268 if self._n>1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
269 self._backoff._writeLens(file) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
270 file.write('ngram %s=%s\n'%(self._n, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
271 sum(len(self._model[c].samples())\ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
272 for c in self._model.keys()))) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
273 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
274 file.write('ngram 1=%s\n'%len(self._model.samples())) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
275 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
276 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
277 def _writeModels(self,file,logBase,precision,alphas): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
278 if self._n>1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
279 self._backoff._writeModels(file,logBase,precision,self._backoff_alphas) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
280 file.write('\n\\%s-grams:\n'%self._n) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
281 if self._n==1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
282 self._writeProbs(self._model,file,logBase,precision,(),alphas) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
283 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
284 for c in sorted(self._model.conditions()): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
285 self._writeProbs(self._model[c],file,logBase,precision, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
286 c,alphas) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
287 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
288 def _writeProbs(self,pd,file,logBase,precision,ctxt,alphas): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
289 if self._n==1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
290 for k in sorted(pd.samples()+['<unk>','<s>']): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
291 if k=='<s>': |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
292 file.write('-99') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
293 elif k=='<unk>': |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
294 _writeProb(file,logBase,precision,1-pd.discount()) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
295 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
296 _writeProb(file,logBase,precision,pd.prob(k)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
297 file.write('\t%s'%k) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
298 if k not in ('</s>','<unk>'): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
299 file.write('\t') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
300 _writeProb(file,logBase,precision,alphas[ctxt+(k,)]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
301 file.write('\n') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
302 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
303 ctxtString=' '.join(ctxt) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
304 for k in sorted(pd.samples()): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
305 _writeProb(file,logBase,precision,pd.prob(k)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
306 file.write('\t%s %s'%(ctxtString,k)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
307 if alphas is not None: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
308 file.write('\t') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
309 _writeProb(file,logBase,precision,alphas[ctxt+(k,)]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
310 file.write('\n') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
311 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
312 def __contains__(self, item): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
313 try: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
314 return item in self._model |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
315 except: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
316 try: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
317 # hack if model is an MLEProbDist, more efficient |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
318 return item in self._model._freqdist |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
319 except: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
320 return item in self._model.samples() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
321 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
322 def __getitem__(self, item): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
323 return self._model[item] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
324 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
325 def __repr__(self): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
326 return '<NgramModel with %d %d-grams>' % (self._N, self._n) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
327 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
328 def _writeProb(file,logBase,precision,p): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
329 file.write('%.*g'%(precision, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
330 p if logBase is None else log(p,logBase))) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
331 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
332 def demo(): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
333 from nltk.corpus import brown |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
334 from nltk.probability import LidstoneProbDist, WittenBellProbDist |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
335 estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
336 # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
337 lm = NgramModel(3, brown.words(categories='news'), estimator) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
338 print lm |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
339 # print lm.entropy(sent) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
340 text = lm.generate(100) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
341 import textwrap |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
342 print '\n'.join(textwrap.wrap(' '.join(text))) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
343 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
344 if __name__ == '__main__': |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
345 demo() |