diff ngram.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngram.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,345 @@
+# Natural Language Toolkit: Language Models
+#
+# Copyright (C) 2001-2009 NLTK Project
+# Author: Steven Bird <sb@csse.unimelb.edu.au>
+# URL: <http://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import types
+from itertools import chain
+from math import log
+
+from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
+                              MLEProbDist, FreqDist)
+try:
+    from nltk.util import ingrams
+except ImportError:
+    from nltkx.util import ingrams
+
+from api import *
+
+class NgramModel(ModelI):
+    """
+    An n-gram language model: assigns a probability to a word in context,
+    using Katz back-off to lower-order models for unseen n-grams.
+    """
+
+    def __init__(self, n, train, pad_left=False, pad_right=False,
+                 estimator=None, *estimator_args, **estimator_kwargs):
+        """
+        Creates an ngram language model to capture patterns in n consecutive
+        words of training text.  An estimator smooths the probabilities derived
+        from the text and may allow generation of ngrams not seen during
+        training.
+
+        @param n: the order of the language model (ngram size)
+        @type n: C{int}
+        @param train: the training text, a list of sentences, each a list
+            of word tokens
+        @type train: C{list} of C{list} of C{string}
+        @param estimator: a function for generating a probability distribution
+        @type estimator: a function that takes a C{ConditionalFreqDist} and
+              returns a C{ConditionalProbDist}
+        @param pad_left: whether to pad the left of each sentence with an (n-1)-gram of start symbols ('<s>')
+        @type pad_left: bool
+        @param pad_right: whether to pad the right of each sentence with an (n-1)-gram (at least one) of end symbols ('</s>')
+        @type pad_right: bool
+        @param estimator_args: Extra arguments for estimator.
+            These arguments are usually used to specify extra
+            properties for the probability distributions of individual
+            conditions, such as the number of bins they contain.
+            Note: For backward-compatibility, if no arguments are specified, the
+            number of bins in the underlying ConditionalFreqDist is passed to
+            the estimator as an argument.
+        @type estimator_args: (any)
+        @param estimator_kwargs: Extra keyword arguments for the estimator
+        @type estimator_kwargs: (any)
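+
+        A minimal illustrative sketch (assuming NLTK's C{WittenBellProbDist}
+        estimator is available)::
+
+            from nltk.probability import WittenBellProbDist
+            est = lambda fdist, bins: WittenBellProbDist(fdist, fdist.B()+1)
+            lm = NgramModel(3, [['a','b','c'], ['a','b','d']],
+                            pad_left=True, pad_right=True, estimator=est)
+            p = lm.prob('c', ('a', 'b'))    # P(c | a b), with Katz back-off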
+        """
+        # protection from cryptic behavior for calling programs
+        # that use the pre-2.0.2 interface
+        assert(isinstance(pad_left, bool))
+        assert(isinstance(pad_right, bool))
+
+        self._n = n
+
+        # Given backoff, a generator isn't acceptable (and len() below
+        #  requires a sequence), so realise it now
+        if isinstance(train, types.GeneratorType):
+            train = list(train)
+
+        self._W = len(train)
+        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
+        # Need _rpad even for unigrams or padded entropy will give the
+        #  wrong answer, because '</s>' would be treated as unseen...
+        self._rpad = ('</s>',) * (max(1,(n - 1))) if pad_right else ()
+        self._padLen = len(self._lpad)+len(self._rpad)
+
+        self._N=0
+        delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent
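+        #  e.g. for n=3 with both pads, a 4-word sentence becomes
+        #  <s> <s> w1 w2 w3 w4 </s> </s>, i.e. 4+delta = 4+(1+4-3) = 6 trigrams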
+
+        if estimator is None:
+            assert (not estimator_args) and (not estimator_kwargs),\
+                   "estimator_args or _kwargs supplied, but no estimator"
+            estimator = lambda fdist, bins: MLEProbDist(fdist)
+
+        if n == 1:
+            if pad_right:
+                sents=(chain(s,self._rpad) for s in train)
+            else:
+                sents=train
+            fd=FreqDist()
+            for s in sents:
+                fd.update(s)
+            if not estimator_args and not estimator_kwargs:
+                self._model = estimator(fd,fd.B())
+            else:
+                self._model = estimator(fd,fd.B(),
+                                        *estimator_args, **estimator_kwargs)
+            self._N=fd.N()
+        else:
+            cfd = ConditionalFreqDist()
+            self._ngrams = set()
+
+            for sent in train:
+                self._N+=len(sent)+delta
+                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
+                    self._ngrams.add(ngram)
+                    context = tuple(ngram[:-1])
+                    token = ngram[-1]
+                    cfd[context][token]+=1
+            if not estimator_args and not estimator_kwargs:
+                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+            else:
+                self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)
+
+        # recursively construct the lower-order models
+        if n > 1:
+            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
+                                       estimator, *estimator_args, **estimator_kwargs)
+
+            # The code below here in this method, and the _words_following
+            # and _alpha methods, is taken from
+            # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
+            self._backoff_alphas = dict()
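+            # Katz back-off: for an ngram unseen in training,
+            #   P(word | ctxt) = alpha(ctxt) * P_backoff(word | ctxt[1:])
+            # where alpha(ctxt) = (1 - sum of P(w | ctxt) over observed w)
+            #                   / (1 - sum of P_backoff(w | ctxt[1:]) over the same w),
+            # which redistributes the left-over probability mass so that the
+            # distribution for each context still sums to (at most) one.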
+            # For each condition (or context)
+            #print cfd,cfd.conditions()
+            for ctxt in cfd.conditions():
+                backoff_ctxt = ctxt[1:]
+                backoff_total_pr = 0.0
+                total_observed_pr = 0.0
+
+                # this is the subset of words that we OBSERVED following
+                # this context.
+                # i.e. Count(word | context) > 0
+                wf = list(self._words_following(ctxt, cfd))
+                for word in wf:
+                    total_observed_pr += self.prob(word, ctxt)
+                    # we also need the total (n-1)-gram probability of
+                    # words observed in this n-gram context
+                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
+                assert (0 <= total_observed_pr <= 1),\
+                       "sum of probs for %s out of bounds: %s"%(ctxt,total_observed_pr)
+                # beta is the remaining probability weight after we factor out
+                # the probability of observed words.
+                # As a sanity check, both total_observed_pr and backoff_total_pr
+                # must be GE 0, since probabilities are never negative
+                beta = 1.0 - total_observed_pr
+
+                # A backoff total of 1 would mean the observed words exhaust
+                #  the backoff distribution (we would never back off), and it
+                #  would also zero the denominator below, so it is excluded;
+                #  greater than 1 is always an error.
+                assert (0 <= backoff_total_pr < 1), \
+                       "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
+                alpha_ctxt = beta / (1.0 - backoff_total_pr)
+
+                self._backoff_alphas[ctxt] = alpha_ctxt
+
+    def _words_following(self, context, cond_freq_dist):
+        return cond_freq_dist[context].iterkeys()
+        # below from http://www.nltk.org/_modules/nltk/model/ngram.html,
+        # depends on new CFD???
+        #for ctxt, word in cond_freq_dist.iterkeys():
+        #    if ctxt == context:
+        #        yield word
+
+    def prob(self, word, context, verbose=False):
+        """
+        Evaluate the probability of this word in this context
+        using Katz Backoff.
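+
+        If the ngram was observed in training, the estimator's smoothed
+        probability for it is returned directly; otherwise the probability
+        from the (n-1)-gram backoff model for C{context[1:]} is scaled by
+        the context's back-off weight alpha (see C{_alpha}).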
+        """
+        assert(isinstance(word,types.StringTypes))
+        context = tuple(context)
+        if self._n==1:
+            if not(self._model.SUM_TO_ONE):
+                # Smoothing models should do the right thing for unigrams
+                #  even if they're 'absent'
+                return self._model.prob(word)
+            else:
+                try:
+                    return self._model.prob(word)
+                except Exception:
+                    raise RuntimeError("No probability mass assigned "
+                                       "to unigram %s" % word)
+        if context + (word,) in self._ngrams:
+            return self[context].prob(word)
+        else:
+            alpha=self._alpha(context)
+            if alpha>0:
+                if verbose:
+                    print "backing off for %s"%(context+(word,),)
+                return alpha * self._backoff.prob(word, context[1:],verbose)
+            else:
+                if verbose:
+                    print "no backoff for %s as model doesn't do any smoothing"%word
+                return alpha
+
+    def _alpha(self, context,verbose=False):
+        """Get the backoff alpha value for the given context
+        """
+        error_message = "Alphas and backoff are not defined for unigram models"
+        assert (not self._n == 1), error_message
+
+        if context in self._backoff_alphas:
+            res = self._backoff_alphas[context]
+        else:
+            res = 1
+        if verbose:
+            print " alpha: %s = %s"%(context,res)
+        return res
+
+
+    def logprob(self, word, context,verbose=False):
+        """
+        Evaluate the (negative) log probability of this word in this context.
+        """
+
+        return -log(self.prob(word, context,verbose), 2)
+
+    # NB, this will always start with same word since model
+    # is trained on a single text
+    def generate(self, num_words, context=()):
+        '''Generate random text based on the language model.'''
+        text = list(context)
+        for i in range(num_words):
+            text.append(self._generate_one(text))
+        return text
+
+    def _generate_one(self, context):
+        context = (self._lpad + tuple(context))[-self._n+1:]
+        # print "Context (%d): <%s>" % (self._n, ','.join(context))
+        if context in self:
+            return self[context].generate()
+        elif self._n > 1:
+            return self._backoff._generate_one(context[1:])
+        else:
+            return '.'
+
+    def entropy(self, text, pad_left=False, pad_right=False,
+                verbose=False, perItem=False):
+        """
+        Evaluate the total entropy of a text with respect to the model:
+        the sum of the negative log2 probabilities (costs) of its ngrams.
+        The padding chosen at construction time is applied; the pad_left
+        and pad_right arguments here are currently ignored.
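+
+        With perItem=True the total is divided by the number of ngrams
+        scored, giving the per-word cross-entropy in bits, so that
+        2**entropy(text, perItem=True) is the model's perplexity on the text.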
+        """
+        # This version takes account of padding for greater accuracy
+        e = 0.0
+        for ngram in ingrams(chain(self._lpad, text, self._rpad), self._n):
+            context = tuple(ngram[:-1])
+            token = ngram[-1]
+            cost=self.logprob(token, context, verbose)  # _negative_
+                                                        # log2 prob == cost!
+            if verbose:
+                print "p(%s|%s) = [%s-gram] %7f"%(token,context,self._n,2**-cost)
+            e += cost
+        if perItem:
+            return e/((len(text)+self._padLen)-(self._n - 1))
+        else:
+            return e
+
+    def dump(self, file, logBase=None, precision=7):
+        """Dump this model in SRILM/ARPA/Doug Paul format
+
+        Use logBase=10 and the default precision to get something comparable
+        to SRILM ngram-model -lm output
+        @param file to dump to
+        @type file file
+        @param logBase If not None, output logBases to the specified base
+        @type logBase int|None"""
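+        # Rough shape of the output (SRILM/ARPA conventions):
+        #   \data\
+        #   ngram 1=<count> ... ngram N=<count>
+        #   \1-grams: ... \N-grams:  one "<logprob>\t<ngram>[\t<backoff weight>]" line each
+        #   \end\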
+        file.write('\n\\data\\\n')
+        self._writeLens(file)
+        self._writeModels(file,logBase,precision,None)
+        file.write('\\end\\\n')
+
+    def _writeLens(self,file):
+        if self._n>1:
+            self._backoff._writeLens(file)
+            file.write('ngram %s=%s\n'%(self._n,
+                                        sum(len(self._model[c].samples())\
+                                            for c in self._model.conditions())))
+        else:
+            file.write('ngram 1=%s\n'%len(self._model.samples()))
+            
+
+    def _writeModels(self,file,logBase,precision,alphas):
+        if self._n>1:
+            self._backoff._writeModels(file,logBase,precision,self._backoff_alphas)
+        file.write('\n\\%s-grams:\n'%self._n)
+        if self._n==1:
+            self._writeProbs(self._model,file,logBase,precision,(),alphas)
+        else:
+            for c in sorted(self._model.conditions()):
+                self._writeProbs(self._model[c],file,logBase,precision,
+                                  c,alphas)
+
+    def _writeProbs(self,pd,file,logBase,precision,ctxt,alphas):
+        if self._n==1:
+            for k in sorted(pd.samples()+['<unk>','<s>']):
+                if k=='<s>':
+                    file.write('-99')
+                elif k=='<unk>':
+                    _writeProb(file,logBase,precision,1-pd.discount()) 
+                else:
+                    _writeProb(file,logBase,precision,pd.prob(k))
+                file.write('\t%s'%k)
+                if k not in ('</s>','<unk>'):
+                    file.write('\t')
+                    _writeProb(file,logBase,precision,alphas[ctxt+(k,)])
+                file.write('\n')
+        else:
+            ctxtString=' '.join(ctxt)
+            for k in sorted(pd.samples()):
+                _writeProb(file,logBase,precision,pd.prob(k))
+                file.write('\t%s %s'%(ctxtString,k))
+                if alphas is not None:
+                    file.write('\t')
+                    _writeProb(file,logBase,precision,alphas[ctxt+(k,)])
+                file.write('\n')
+
+    def __contains__(self, item):
+        try:
+            return item in self._model
+        except Exception:
+            try:
+                # hack if model is an MLEProbDist, more efficient
+                return item in self._model._freqdist
+            except Exception:
+                return item in self._model.samples()
+
+    def __getitem__(self, item):
+        return self._model[item]
+
+    def __repr__(self):
+        return '<NgramModel with %d %d-grams>' % (self._N, self._n)
+
+def _writeProb(file,logBase,precision,p):
+    file.write('%.*g'%(precision,
+                       p if logBase is None else log(p,logBase)))
+
+def demo():
+    from nltk.corpus import brown
+    from nltk.probability import LidstoneProbDist, WittenBellProbDist
+    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2, bins)
+#    estimator = lambda fdist, bins: WittenBellProbDist(fdist, bins)
+    lm = NgramModel(3, brown.sents(categories='news'), estimator=estimator)
+    print lm
+#    print lm.entropy(sent)
+    text = lm.generate(100)
+    import textwrap
+    print '\n'.join(textwrap.wrap(' '.join(text)))
+
+if __name__ == '__main__':
+    demo()