Mercurial > hg > python
diff twitter.py @ 0:fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 09 Mar 2020 14:58:04 +0000 |
parents | |
children |
line wrap: on
line diff
# Diff header from the changeset view, kept for provenance:
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/twitter.py Mon Mar 09 14:58:04 2020 +0000
# @@ -0,0 +1,113 @@
"""NLTK helpers: a line-per-sentence corpus reader, Witten-Bell sanity
checks, an n-gram model with a Witten-Bell default estimator, and a
sorted frequency plot.

Importing this module has side effects: it builds the ``xtwc`` corpus
reader and attaches ``discount``/``check`` to ``WittenBellProbDist`` and
``plotSorted`` to ``FreqDist``.
"""
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader import RegexpTokenizer
from nltk.tokenize import LineTokenizer
from nltk.corpus.reader.util import read_line_block
from nltkx.model import NgramModel
from nltk import (ConditionalFreqDist, ngrams,
                  chain, ConditionalProbDist, WittenBellProbDist, FreqDist)
import types

# Corpus reader over the files matching r'2.*\.txt' in the directory below.
# Each line is one sentence (LineTokenizer / read_line_block).  Word tokens
# are, in order of preference: a URL, a run of word characters plus '#'/'@',
# or a run of punctuation.
#
# The scheme alternation is a NON-capturing group on purpose: NLTK versions
# whose RegexpTokenizer tokenizes with re.findall and does not rewrite groups
# would otherwise return only the captured scheme ('http', or '' for non-URL
# tokens) instead of the whole token.  The set of matched strings is the
# same as with the capturing form.
xtwc = PlaintextCorpusReader(
    "/group/ltg/projects/fnlp/",
    r'2.*\.txt',
    word_tokenizer=RegexpTokenizer(
        r'(?:http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
    sent_tokenizer=LineTokenizer(),
    para_block_reader=read_line_block)


def discount(self):
    """Return ``_N / (_N + _T)`` for this distribution, as a float.

    Installed below as ``WittenBellProbDist.discount``.  ``check`` compares
    this value against the summed probability of all observed samples.
    NOTE(review): presumably ``_N`` is the total count and ``_T`` the number
    of observed sample types, as in NLTK's WittenBellProbDist -- confirm
    against the installed NLTK version.

    :raises ZeroDivisionError: if ``_N + _T`` is zero.
    """
    return float(self._N) / float(self._N + self._T)


def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """Return True if *a* and *b* are equal within the given tolerances.

    :param rel_tol: tolerance relative to the larger of ``abs(a)``, ``abs(b)``
    :param abs_tol: minimum absolute tolerance
    """
    # http://stackoverflow.com/a/33024979
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)


def check(self):
    """Assert that the probabilities of all samples sum to ``discount()``.

    Installed below as ``WittenBellProbDist.check``.

    :raises AssertionError: if the two values are not close (see ``isclose``)
    """
    totProb = sum(self.prob(sample) for sample in self.samples())
    assert isclose(self.discount(), totProb), \
        "discount %s != totProb %s" % (self.discount(), totProb)


WittenBellProbDist.discount = discount
WittenBellProbDist.check = check


def _estimator(fdist, bins):
    """
    Default estimator function using WB.

    Builds a ``WittenBellProbDist`` over *fdist* with ``fdist.B() + 1`` bins
    and runs its ``check()`` before returning it.

    :param fdist: the frequency distribution to smooth
    :param bins: accepted for interface compatibility; not used
    :return: the checked ``WittenBellProbDist``
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    res = WittenBellProbDist(fdist, fdist.B() + 1)
    res.check()
    return res


class LgramModel(NgramModel):
    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Same as NgramModel (q.v.), but with a WittenBell default estimator

        If *estimator* is None, ``_estimator`` is used; in that case no
        estimator args or kwargs may be supplied (checked by assertion).
        All arguments are otherwise passed through to ``NgramModel``.
        """
        if estimator is None:
            assert (not estimator_args) and (not estimator_kwargs), \
                "estimator_args (%s) or _kwargs (%s) supplied, but no estimator" % (estimator_args, estimator_kwargs)
            estimator = _estimator
        super(LgramModel, self).__init__(n, train, pad_left, pad_right,
                                         estimator,
                                         *estimator_args, **estimator_kwargs)


from nltk.probability import _get_kwarg
try:
    from nltk.probability import islice
except ImportError:
    # Only a failed import should trigger the fallback location.
    from nltk.util import islice

# Text type used for tick labels: ``unicode`` on Python 2, ``str`` on
# Python 3 (where the name ``unicode`` does not exist).
try:
    _text_type = unicode
except NameError:
    _text_type = str


def plotSorted(self, *args, **kwargs):
    """
    Plot samples from the frequency distribution,
    sorted using a supplied key function.  If an integer
    parameter is supplied, stop after this many samples have been
    plotted.  If two integer parameters m, n are supplied, plot a
    subset of the samples, beginning with m and stopping at n-1.
    For a cumulative plot, specify cumulative=True.
    (Requires Matplotlib to be installed.)

    The samples are first selected by slicing the distribution's own
    iteration order with the positional arguments, and only then sorted.
    Remaining keyword arguments are passed on to ``pylab.plot``
    (``linewidth`` defaults to 2).

    :param title: The title for the graph
    :type title: str
    :param key: a function to pass to sort to extract the sort key
      given an FD and a sample id.
      Defaults to the value of that sample's entry,
      lambda fd,s:fd[s]
    :type key: function
    :param reverse: True to sort high to low
    :type reverse: bool
    :raises ValueError: if pylab cannot be imported
    """
    try:
        import pylab
    except ImportError:
        raise ValueError('The plot function requires the matplotlib package (aka pylab). '
                         'See http://matplotlib.sourceforge.net/')

    if not args:
        # No slice bounds given: take every sample.
        args = [len(self)]

    # _get_kwarg is NLTK's private helper; it is relied on here to take
    # 'key'/'reverse' out of kwargs so they are not forwarded to pylab.plot.
    keyFn = _get_kwarg(kwargs, 'key', lambda fd, s: fd[s])
    reverse = _get_kwarg(kwargs, 'reverse', False)

    samples = list(islice(self, *args))
    samples.sort(key=lambda x: keyFn(self, x), reverse=reverse)

    freqs = [self[sample] for sample in samples]
    ylabel = "Counts"
    # percents = [f * 100 for f in freqs]  only in ProbDist?

    pylab.grid(True, color="silver")
    if "linewidth" not in kwargs:
        kwargs["linewidth"] = 2
    if "title" in kwargs:
        pylab.title(kwargs["title"])
        del kwargs["title"]
    pylab.plot(freqs, **kwargs)
    pylab.xticks(range(len(samples)), [_text_type(s) for s in samples], rotation=90)
    pylab.xlabel("Samples")
    pylab.ylabel(ylabel)
    pylab.show()


FreqDist.plotSorted = plotSorted