Mercurial > hg > python
view twitter.py @ 13:70993b538ddb
works on 5a
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 20 Mar 2020 19:15:42 +0000 |
parents | fee51ab07d09 |
children |
line wrap: on
line source
"""Twitter corpus reader, Witten-Bell n-gram model and a sorted FreqDist plot.

Importing this module has side effects:

* ``xtwc`` is built as a ``PlaintextCorpusReader`` over a fixed directory;
* ``WittenBellProbDist`` gains ``discount`` and ``check`` methods;
* ``FreqDist`` gains a ``plotSorted`` method.
"""
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader import RegexpTokenizer
from nltk.tokenize import LineTokenizer
from nltk.corpus.reader.util import read_line_block
from nltkx.model import NgramModel
from nltk import (ConditionalFreqDist, ngrams,
                  chain, ConditionalProbDist, WittenBellProbDist, FreqDist)
from nltk.probability import _get_kwarg
import types

try:
    from nltk.probability import islice
except ImportError:
    # Only a failed import should trigger the fallback; a bare ``except``
    # would also hide KeyboardInterrupt and genuine errors.
    from nltk.util import islice

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str      # Python 3: ``unicode`` no longer exists


# One tweet per line: sentences are lines, and paragraph blocks are read
# line by line.  A token is a URL (http/ftp/mailto), a run of word
# characters possibly containing '#' or '@', or a run of punctuation.
xtwc = PlaintextCorpusReader(
    "/group/ltg/projects/fnlp/",
    r'2.*\.txt',
    word_tokenizer=RegexpTokenizer(
        r'(http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
    sent_tokenizer=LineTokenizer(),
    para_block_reader=read_line_block)


def discount(self):
    """
    Return ``_N / (_N + _T)`` as a float.

    Installed below as ``WittenBellProbDist.discount``.  ``check`` expects
    this value to equal the summed probability of all observed samples.
    (In NLTK's WittenBellProbDist ``_N`` is presumably the total number of
    observed events and ``_T`` the number of observed types -- confirm
    against the installed NLTK version.)

    :rtype: float
    """
    # float() on both sides guards against integer division on Python 2.
    return float(self._N) / float(self._N + self._T)


def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """
    Return True if *a* and *b* are equal within the given tolerances.

    :param a: first value
    :param b: second value
    :param rel_tol: tolerance relative to the larger magnitude of a and b
    :param abs_tol: minimum absolute tolerance
    :rtype: bool
    """
    # http://stackoverflow.com/a/33024979
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)


def check(self):
    """
    Assert that the probabilities of all samples sum to ``discount()``.

    Installed below as ``WittenBellProbDist.check``.

    :raise AssertionError: if the two values are not close (see ``isclose``)
    """
    totProb = sum(self.prob(sample) for sample in self.samples())
    assert isclose(self.discount(), totProb),\
        "discount %s != totProb %s" % (self.discount(), totProb)


WittenBellProbDist.discount = discount
WittenBellProbDist.check = check


def _estimator(fdist, bins):
    """
    Default estimator function using WB.

    :param fdist: the frequency distribution to smooth
    :param bins: accepted for interface compatibility but ignored; the
        number of bins is always ``fdist.B() + 1``
    :return: a WittenBellProbDist that has passed its ``check()``
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    res = WittenBellProbDist(fdist, fdist.B() + 1)
    res.check()
    return res


class LgramModel(NgramModel):
    """NgramModel whose default estimator is Witten-Bell (``_estimator``)."""

    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Same as NgramModel (q.v.), but with a WittenBell default estimator

        All arguments are passed through to ``NgramModel.__init__``
        unchanged, except that ``estimator`` defaults to ``_estimator``.

        :raise AssertionError: if estimator arguments are supplied without
            an estimator
        """
        if estimator is None:
            assert (not(estimator_args)) and (not(estimator_kwargs)),\
                "estimator_args (%s) or _kwargs (%s) supplied, but no estimator" % (estimator_args, estimator_kwargs)
            estimator = _estimator
        super(LgramModel, self).__init__(n, train, pad_left, pad_right,
                                         estimator,
                                         *estimator_args, **estimator_kwargs)


def plotSorted(self, *args, **kwargs):
    """
    Plot samples from the frequency distribution,
    sorted using a supplied key function.  If an integer
    parameter is supplied, stop after this many samples have been
    plotted.  If two integer parameters m, n are supplied, plot a
    subset of the samples, beginning with m and stopping at n-1.
    Note that the subset is taken in the distribution's own iteration
    order *before* sorting.
    For a cumulative plot, specify cumulative=True.
    (Requires Matplotlib to be installed.)

    Any remaining keyword arguments are passed to ``pylab.plot``.

    :param title: The title for the graph
    :type title: str
    :param key: a function to pass to sort to extract the sort key
      given an FD and a sample id.
      Defaults to the value of that sample's entry,
      lambda fd,s:fd[s]
    :type key: function
    :param reverse: True to sort high to low
    :type reverse: bool
    :param cumulative: True to plot running totals instead of counts
    :type cumulative: bool
    :raise ValueError: if matplotlib (pylab) cannot be imported
    """
    try:
        import pylab
    except ImportError:
        raise ValueError('The plot function requires the matplotlib package (aka pylab). '
                         'See http://matplotlib.sourceforge.net/')

    if len(args) == 0:
        args = [len(self)]

    keyFn = _get_kwarg(kwargs, 'key', lambda fd, s: fd[s])
    reverse = _get_kwarg(kwargs, 'reverse', False)
    # Consume 'cumulative' here so it is not forwarded to pylab.plot.
    cumulative = _get_kwarg(kwargs, 'cumulative', False)

    samples = list(islice(self, *args))
    samples.sort(key=lambda x: keyFn(self, x), reverse=reverse)

    counts = [self[sample] for sample in samples]
    if cumulative:
        freqs = []
        running = 0
        for count in counts:
            running += count
            freqs.append(running)
        ylabel = "Cumulative Counts"
    else:
        freqs = counts
        ylabel = "Counts"
    # NOTE: no percentage scaling (f * 100) here -- only in ProbDist?

    pylab.grid(True, color="silver")
    if "linewidth" not in kwargs:
        kwargs["linewidth"] = 2
    if "title" in kwargs:
        pylab.title(kwargs["title"])
        del kwargs["title"]
    pylab.plot(freqs, **kwargs)
    pylab.xticks(range(len(samples)),
                 [_text_type(s) for s in samples], rotation=90)
    pylab.xlabel("Samples")
    pylab.ylabel(ylabel)
    pylab.show()


FreqDist.plotSorted = plotSorted