Mercurial > hg > python
annotate ngram_3.0.py @ 17:6c9e371b0325
doesn't break, what's there is good, but stuck part-way in 10a
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 25 Mar 2020 19:21:38 +0000 |
parents | fee51ab07d09 |
children |
rev | line source |
---|---|
0
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 # Natural Language Toolkit: Language Models |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Copyright (C) 2001-2014 NLTK Project |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # Authors: Steven Bird <stevenbird1@gmail.com> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 # Daniel Blanchard <dblanchard@ets.org> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 # Ilia Kurenkov <ilia.kurenkov@gmail.com> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 # URL: <http://nltk.org/> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 # For license information, see LICENSE.TXT |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 ######## Copied from http://www.nltk.org/_modules/nltk/model/ngram.html 2017-01-14 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 ######## Not actually part of 3.0 release |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 ######## "© Copyright 2015, NLTK Project. Last updated on Feb 26, 2015. Created using Sphinx 1.2.3" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 from __future__ import unicode_literals |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 from itertools import chain |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 from math import log |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 from nltk.probability import (FreqDist, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 ConditionalProbDist, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 ConditionalFreqDist, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 LidstoneProbDist) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 from nltk.util import ngrams |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 from nltk.model.api import ModelI |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 from nltk import compat |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 def _estimator(fdist, *estimator_args, **estimator_kwargs): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
29 Default estimator function using a SimpleGoodTuringProbDist. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
30 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 # can't be an instance method of NgramModel as they |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
32 # can't be pickled either. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
33 return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
34 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
36 @compat.python_2_unicode_compatible |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
37 class NgramModel(ModelI): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
38 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
39 A processing interface for assigning a probability to the next word. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
40 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
42 def __init__(self, n, train, pad_left=True, pad_right=False, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
43 estimator=None, *estimator_args, **estimator_kwargs): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
44 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
45 Create an ngram language model to capture patterns in n consecutive |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
46 words of training text. An estimator smooths the probabilities derived |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
47 from the text and may allow generation of ngrams not seen during |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
48 training. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
49 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
50 >>> from nltk.corpus import brown |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
51 >>> from nltk.probability import LidstoneProbDist |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
52 >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
53 >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
54 >>> lm |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
55 <NgramModel with 91603 3-grams> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
56 >>> lm._backoff |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
57 <NgramModel with 62888 2-grams> |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
58 >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
59 ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
60 ... 'primary', 'election', 'produced', '``', 'no', 'evidence', |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
61 ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.']) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
62 ... # doctest: +ELLIPSIS |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
63 0.5776... |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
64 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
65 :param n: the order of the language model (ngram size) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
66 :type n: int |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
67 :param train: the training text |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
68 :type train: list(str) or list(list(str)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
69 :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
70 :type pad_left: bool |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
71 :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
72 :type pad_right: bool |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
73 :param estimator: a function for generating a probability distribution |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
74 :type estimator: a function that takes a ConditionalFreqDist and |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
75 returns a ConditionalProbDist |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
76 :param estimator_args: Extra arguments for estimator. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
77 These arguments are usually used to specify extra |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
78 properties for the probability distributions of individual |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
79 conditions, such as the number of bins they contain. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
80 Note: For backward-compatibility, if no arguments are specified, the |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
81 number of bins in the underlying ConditionalFreqDist are passed to |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
82 the estimator as an argument. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
83 :type estimator_args: (any) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
84 :param estimator_kwargs: Extra keyword arguments for the estimator |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
85 :type estimator_kwargs: (any) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
86 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
87 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
88 # protection from cryptic behavior for calling programs |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
89 # that use the pre-2.0.2 interface |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
90 assert(isinstance(pad_left, bool)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
91 assert(isinstance(pad_right, bool)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
92 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
93 # make sure n is greater than zero, otherwise print it |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
94 assert (n > 0), n |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
95 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
96 # For explicitness save the check whether this is a unigram model |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
97 self.is_unigram_model = (n == 1) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
98 # save the ngram order number |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
99 self._n = n |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
100 # save left and right padding |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
101 self._lpad = ('',) * (n - 1) if pad_left else () |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
102 self._rpad = ('',) * (n - 1) if pad_right else () |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
103 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
104 if estimator is None: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
105 estimator = _estimator |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
106 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
107 cfd = ConditionalFreqDist() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
108 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
109 # set read-only ngrams set (see property declaration below to reconfigure) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
110 self._ngrams = set() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
111 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
112 # If given a list of strings instead of a list of lists, create enclosing list |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
113 if (train is not None) and isinstance(train[0], compat.string_types): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
114 train = [train] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
115 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
116 for sent in train: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
117 raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
118 for ngram in raw_ngrams: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
119 self._ngrams.add(ngram) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
120 context = tuple(ngram[:-1]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
121 token = ngram[-1] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
122 cfd[(context, token)] += 1 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
123 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
124 self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
125 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
126 # recursively construct the lower-order models |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
127 if not self.is_unigram_model: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
128 self._backoff = NgramModel(n-1, train, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
129 pad_left, pad_right, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
130 estimator, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
131 *estimator_args, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
132 **estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
133 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
134 self._backoff_alphas = dict() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
135 # For each condition (or context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
136 for ctxt in cfd.conditions(): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
137 backoff_ctxt = ctxt[1:] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
138 backoff_total_pr = 0.0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
139 total_observed_pr = 0.0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
140 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
141 # this is the subset of words that we OBSERVED following |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
142 # this context. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
143 # i.e. Count(word | context) > 0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
144 for word in self._words_following(ctxt, cfd): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
145 total_observed_pr += self.prob(word, ctxt) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
146 # we also need the total (n-1)-gram probability of |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
147 # words observed in this n-gram context |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
148 backoff_total_pr += self._backoff.prob(word, backoff_ctxt) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
149 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
150 assert (0 <= total_observed_pr <= 1), total_observed_pr |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
151 # beta is the remaining probability weight after we factor out |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
152 # the probability of observed words. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
153 # As a sanity check, both total_observed_pr and backoff_total_pr |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
154 # must be GE 0, since probabilities are never negative |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
155 beta = 1.0 - total_observed_pr |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
156 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
157 # backoff total has to be less than one, otherwise we get |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
158 # an error when we try subtracting it from 1 in the denominator |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
159 assert (0 <= backoff_total_pr < 1), backoff_total_pr |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
160 alpha_ctxt = beta / (1.0 - backoff_total_pr) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
161 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
162 self._backoff_alphas[ctxt] = alpha_ctxt |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
163 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
164 def _words_following(self, context, cond_freq_dist): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
165 for ctxt, word in cond_freq_dist.iterkeys(): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
166 if ctxt == context: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
167 yield word |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
168 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
169 def prob(self, word, context): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
170 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
171 Evaluate the probability of this word in this context using Katz Backoff. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
172 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
173 :param word: the word to get the probability of |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
174 :type word: str |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
175 :param context: the context the word is in |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
176 :type context: list(str) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
177 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
178 context = tuple(context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
179 if (context + (word,) in self._ngrams) or (self.is_unigram_model): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
180 return self._probdist.prob((context, word)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
181 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
182 return self._alpha(context) * self._backoff.prob(word, context[1:]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
183 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
184 def _alpha(self, context): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
185 """Get the backoff alpha value for the given context |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
186 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
187 error_message = "Alphas and backoff are not defined for unigram models" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
188 assert not self.is_unigram_model, error_message |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
189 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
190 if context in self._backoff_alphas: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
191 return self._backoff_alphas[context] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
192 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
193 return 1 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
194 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
195 def logprob(self, word, context): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
196 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
197 Evaluate the (negative) log probability of this word in this context. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
198 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
199 :param word: the word to get the probability of |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
200 :type word: str |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
201 :param context: the context the word is in |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
202 :type context: list(str) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
203 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
204 return -log(self.prob(word, context), 2) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
205 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
206 @property |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
207 def ngrams(self): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
208 return self._ngrams |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
209 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
210 @property |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
211 def backoff(self): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
212 return self._backoff |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
213 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
214 @property |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
215 def probdist(self): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
216 return self._probdist |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
217 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
218 def choose_random_word(self, context): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
219 ''' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
220 Randomly select a word that is likely to appear in this context. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
221 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
222 :param context: the context the word is in |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
223 :type context: list(str) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
224 ''' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
225 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
226 return self.generate(1, context)[-1] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
227 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
228 # NB, this will always start with same word if the model |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
229 # was trained on a single text |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
230 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
231 def generate(self, num_words, context=()): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
232 ''' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
233 Generate random text based on the language model. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
234 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
235 :param num_words: number of words to generate |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
236 :type num_words: int |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
237 :param context: initial words in generated string |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
238 :type context: list(str) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
239 ''' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
240 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
241 text = list(context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
242 for i in range(num_words): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
243 text.append(self._generate_one(text)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
244 return text |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
245 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
246 def _generate_one(self, context): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
247 context = (self._lpad + tuple(context))[- self._n + 1:] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
248 if context in self: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
249 return self[context].generate() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
250 elif self._n > 1: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
251 return self._backoff._generate_one(context[1:]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
252 else: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
253 return '.' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
254 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
255 def entropy(self, text): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
256 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
257 Calculate the approximate cross-entropy of the n-gram model for a |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
258 given evaluation text. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
259 This is the average log probability of each word in the text. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
260 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
261 :param text: words to use for evaluation |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
262 :type text: list(str) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
263 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
264 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
265 e = 0.0 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
266 text = list(self._lpad) + text + list(self._rpad) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
267 for i in range(self._n - 1, len(text)): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
268 context = tuple(text[i - self._n + 1:i]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
269 token = text[i] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
270 e += self.logprob(token, context) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
271 return e / float(len(text) - (self._n - 1)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
272 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
273 def perplexity(self, text): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
274 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
275 Calculates the perplexity of the given text. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
276 This is simply 2 ** cross-entropy for the text. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
277 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
278 :param text: words to calculate perplexity of |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
279 :type text: list(str) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
280 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
281 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
282 return pow(2.0, self.entropy(text)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
283 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
284 def __contains__(self, item): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
285 return tuple(item) in self._probdist.freqdist |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
286 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
287 def __getitem__(self, item): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
288 return self._probdist[tuple(item)] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
289 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
290 def __repr__(self): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
291 return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
292 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
293 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
294 def teardown_module(module=None): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
295 from nltk.corpus import brown |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
296 brown._unload() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
297 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
298 if __name__ == "__main__": |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
299 import doctest |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
300 doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) |