view twitter.py @ 30:9f40651f6080

works, but not quite right for cc simulation
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 29 Jul 2021 10:21:51 +0100
parents fee51ab07d09
children
line wrap: on
line source

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader import RegexpTokenizer
from nltk.tokenize import LineTokenizer
from nltk.corpus.reader.util import read_line_block
from nltkx.model import NgramModel
from nltk import ConditionalFreqDist, ngrams,\
     chain, ConditionalProbDist, WittenBellProbDist, FreqDist
import types

# Corpus reader over the files matching r'2.*\.txt' under
# /group/ltg/projects/fnlp/ (presumably the tweet collection, given the
# module name -- confirm).  Each line is treated as one sentence
# (LineTokenizer) and blocks are read a line at a time (read_line_block).
#
# Word tokens are, in order of preference: a URL-like string starting with
# http://, ftp:// or mailto://; a run of word characters, '#' and '@';
# or a run of characters that are neither word characters nor whitespace.
#
# The scheme alternation is a NON-capturing group on purpose: NLTK's
# RegexpTokenizer documentation says patterns must not contain capturing
# parentheses, because re.findall would then return the group contents
# instead of the whole match.
xtwc=PlaintextCorpusReader("/group/ltg/projects/fnlp/",
                          r'2.*\.txt',
                          word_tokenizer=RegexpTokenizer(r'(?:http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
                          sent_tokenizer=LineTokenizer(),
                          para_block_reader=read_line_block)

def discount(self):
    """
    Return N / (N + T) as a float.

    Reads the ``_N`` and ``_T`` attributes of the instance; intended to be
    attached to WittenBellProbDist as a method.
    """
    denominator = float(self._N + self._T)
    numerator = float(self._N)
    return numerator / denominator

def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """
    Return True if *a* and *b* are approximately equal.

    The values are considered close when their absolute difference does
    not exceed the larger of ``rel_tol`` times the larger magnitude of the
    two values and ``abs_tol``.
    """
    # Recipe from http://stackoverflow.com/a/33024979
    difference = abs(a - b)
    largest_magnitude = max(abs(a), abs(b))
    tolerance = max(rel_tol * largest_magnitude, abs_tol)
    return difference <= tolerance

def check(self):
    """
    Sanity-check the distribution against its discount.

    Adds up ``self.prob(sample)`` over every sample returned by
    ``self.samples()`` and asserts that the total is close (per
    ``isclose``) to ``self.discount()``.

    :raises AssertionError: if the two values differ, reporting both.
    """
    totProb = 0
    for sample in self.samples():
        totProb += self.prob(sample)
    assert isclose(self.discount(), totProb),\
           "discount %s != totProb %s"%(self.discount(),totProb)
           

# Attach the two module-level functions to NLTK's WittenBellProbDist class
# so that they can be called as methods: dist.discount() and dist.check().
WittenBellProbDist.discount = discount
WittenBellProbDist.check = check

def _estimator(fdist, bins):
    """
    Default estimator function: a self-checked Witten-Bell distribution.

    Builds a WittenBellProbDist over *fdist* using ``fdist.B() + 1`` bins,
    runs its ``check()`` method and returns the distribution.  The *bins*
    argument is accepted but not used.
    """
    # Defined at module level because, as an instance method of
    # NgramModel, it could not be pickled.
    wb_dist = WittenBellProbDist(fdist, fdist.B() + 1)
    wb_dist.check()
    return wb_dist

class LgramModel(NgramModel):
    """NgramModel variant that defaults to the module's Witten-Bell estimator."""

    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Same as NgramModel (q.v.), but with a WittenBell default estimator.

        When *estimator* is None the module-level ``_estimator`` is used;
        in that case no extra estimator arguments may be given.  All
        arguments are then forwarded to ``NgramModel.__init__``.

        :raises AssertionError: if estimator arguments are supplied
          without an estimator.
        """
        if estimator is None:
            # Extra arguments are meaningless without an estimator to
            # receive them.
            assert not (estimator_args or estimator_kwargs),\
                   "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
            estimator = _estimator
        super(LgramModel, self).__init__(
            n, train, pad_left, pad_right, estimator,
            *estimator_args, **estimator_kwargs)

from nltk.probability import _get_kwarg
# islice is looked up in nltk.probability first and in nltk.util as a
# fallback (presumably to cope with different NLTK versions -- confirm).
# Only a failed import triggers the fallback; anything else (e.g.
# KeyboardInterrupt) is allowed to propagate.
try:
    from nltk.probability import islice
except ImportError:
    from nltk.util import islice

def plotSorted(self, *args, **kwargs):
    """
    Plot samples from the frequency distribution,
    sorted using a supplied key function.  If an integer
    parameter is supplied, stop after this many samples have been
    plotted.  If two integer parameters m, n are supplied, plot a
    subset of the samples, beginning with m and stopping at n-1.
    With no positional parameters, all samples are plotted.
    For a cumulative plot, specify cumulative=True.
    (Requires Matplotlib to be installed.)

    The positional parameters are applied (via islice) to the
    distribution's own iteration order; the selected samples are
    sorted afterwards.  Any keyword arguments not listed below are
    passed through to ``pylab.plot``; ``linewidth`` defaults to 2.

    :param title: The title for the graph
    :type title: str
    :param key: a function to pass to sort to extract the sort key
      given an FD and a sample id.
      Defaults to the value of that sample's entry,
      lambda fd,s:fd[s]
    :type key: function
    :param reverse: True to sort high to low
    :type reverse: bool
    :param cumulative: True to plot the running total of the sorted
      counts instead of the individual counts
    :type cumulative: bool
    :raises ValueError: if pylab (matplotlib) cannot be imported
    """
    try:
        import pylab
    except ImportError:
        raise ValueError('The plot function requires the matplotlib package (aka pylab). '
                         'See http://matplotlib.sourceforge.net/')

    # ``unicode`` only exists on Python 2; fall back to ``str`` elsewhere
    # so that building the tick labels does not raise NameError.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    if not args:
        args = (len(self),)

    # Remove our own options from kwargs so that they are not forwarded
    # to pylab.plot(), which does not understand them.
    keyFn = kwargs.pop('key', lambda fd, s: fd[s])
    reverse = kwargs.pop('reverse', False)
    cumulative = kwargs.pop('cumulative', False)

    samples = list(islice(self, *args))
    samples.sort(key=lambda x: keyFn(self, x), reverse=reverse)

    freqs = [self[sample] for sample in samples]
    ylabel = "Counts"
    if cumulative:
        # Replace each count by the running total up to that sample.
        running_total = 0
        accumulated = []
        for count in freqs:
            running_total += count
            accumulated.append(running_total)
        freqs = accumulated
        ylabel = "Cumulative Counts"

    pylab.grid(True, color="silver")
    kwargs.setdefault("linewidth", 2)
    if "title" in kwargs:
        pylab.title(kwargs.pop("title"))
    pylab.plot(freqs, **kwargs)
    pylab.xticks(range(len(samples)), [to_text(s) for s in samples], rotation=90)
    pylab.xlabel("Samples")
    pylab.ylabel(ylabel)
    pylab.show()

# Attach plotSorted to NLTK's FreqDist class so that it can be called as a
# method: fd.plotSorted(...).
FreqDist.plotSorted=plotSorted