python: twitter.py comparison

comparison twitter.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain

author	Henry S. Thompson <ht@inf.ed.ac.uk>
date	Mon, 09 Mar 2020 14:58:04 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:fee51ab07d09
+from nltk.corpus.reader.plaintext import PlaintextCorpusReader
+from nltk.corpus.reader import RegexpTokenizer
+from nltk.tokenize import LineTokenizer
+from nltk.corpus.reader.util import read_line_block
+from nltkx.model import NgramModel
+from nltk import ConditionalFreqDist, ngrams,\
+chain, ConditionalProbDist, WittenBellProbDist, FreqDist
+import types
# Corpus reader over every file matching the pattern ``2.*\.txt`` in the
# shared project directory.  Sentences are split with LineTokenizer and
# paragraphs are read with read_line_block.
xtwc = PlaintextCorpusReader(
    "/group/ltg/projects/fnlp/",
    r'2.*\.txt',
    # Token alternatives, tried in order: a whole URL, a run of word
    # characters plus '#'/'@', or a run of punctuation.  The scheme group
    # is non-capturing on purpose: a findall-based tokenizer would
    # otherwise return only the group's text (e.g. 'http' or '') instead
    # of the complete token.
    word_tokenizer=RegexpTokenizer(
        r'(?:http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
    sent_tokenizer=LineTokenizer(),
    para_block_reader=read_line_block)
def discount(self):
    """
    Return ``N / (N + T)`` computed from ``self._N`` and ``self._T``.

    Written as a free function taking ``self`` so that it can be attached
    to a class as a method.
    NOTE(review): ``_N`` and ``_T`` are presumably the token and type
    counts kept by WittenBellProbDist -- confirm against NLTK.

    :return: the ratio as a float; 0.0 when ``N + T`` is zero, so that an
        empty distribution does not raise ZeroDivisionError
    :rtype: float
    """
    total = self._N + self._T
    if total == 0:
        return 0.0
    return float(self._N) / float(total)
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """
    Report whether two numbers are equal within a tolerance.

    :param a: first value
    :param b: second value
    :param rel_tol: tolerance relative to the larger magnitude of a and b
    :param abs_tol: minimum absolute tolerance (useful near zero)
    :return: True if ``abs(a-b)`` is within the larger of the two tolerances
    :rtype: bool
    """
    # http://stackoverflow.com/a/33024979
    if a == b:
        # Covers exact matches, including two infinities of the same sign
        # (for which a-b would be NaN and the comparison below would fail).
        return True
    infinity = float("inf")
    if abs(a) == infinity or abs(b) == infinity:
        # An infinity is never close to anything but itself; without this
        # guard the relative tolerance would itself be infinite.
        return False
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
def check(self):
    """
    Sanity-check that the probabilities of all samples sum to the discount.

    Sums ``self.prob(sample)`` over ``self.samples()`` and compares the
    total with ``self.discount()`` using ``isclose``.

    :raise AssertionError: if the two values differ beyond the default
        tolerance of ``isclose`` (the check is skipped under ``python -O``)
    """
    expected = self.discount()
    totProb = sum(self.prob(sample) for sample in self.samples())
    assert isclose(expected, totProb),\
        "discount %s != totProb %s" % (expected, totProb)
# Monkey-patch WittenBellProbDist so that every instance gains (or has
# replaced) the discount() and check() methods defined in this module.
WittenBellProbDist.discount = discount
WittenBellProbDist.check = check
def _estimator(fdist, bins):
    """
    Default estimator function using WB.

    Builds a WittenBellProbDist over ``fdist`` with one more bin than the
    number of bins ``fdist`` reports via ``B()``, runs its ``check()``
    sanity test, and returns it.

    :param fdist: the frequency distribution to smooth
    :param bins: accepted for signature compatibility but not used; the
        bin count is always ``fdist.B() + 1``
    :return: the checked Witten-Bell distribution
    :raise AssertionError: if ``check()`` finds the distribution inconsistent
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    res = WittenBellProbDist(fdist, fdist.B() + 1)
    res.check()
    return res
class LgramModel(NgramModel):
    """NgramModel variant that falls back to a Witten-Bell estimator."""

    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Same as NgramModel (q.v.), but with a WittenBell default estimator

        All arguments are forwarded unchanged to ``NgramModel.__init__``,
        except that a missing ``estimator`` is replaced by the module's
        ``_estimator`` function.

        :raise AssertionError: if estimator arguments are supplied without
            an estimator to receive them
        """
        if estimator is None:
            # Extra estimator arguments make no sense for the built-in
            # default, which takes none.
            assert (not(estimator_args)) and (not(estimator_kwargs)),\
                "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
            estimator = _estimator
        super(LgramModel, self).__init__(n, train, pad_left, pad_right,
                                         estimator,
                                         *estimator_args, **estimator_kwargs)
from nltk.probability import _get_kwarg
# islice is re-exported from different NLTK modules depending on the
# installed version; only a failed import should trigger the fallback.
try:
    from nltk.probability import islice
except ImportError:
    from nltk.util import islice
def plotSorted(self, *args, **kwargs):
    """
    Plot samples from the frequency distribution,
    sorted using a supplied key function.  If an integer
    parameter is supplied, stop after this many samples have been
    plotted.  If two integer parameters m, n are supplied, plot a
    subset of the samples, beginning with m and stopping at n-1.
    The subset is selected in the distribution's own iteration order
    and only then sorted.
    For a cumulative plot, specify cumulative=True.
    (Requires Matplotlib to be installed.)

    Any keyword arguments not listed below are passed on to pylab.plot.

    :param title: The title for the graph
    :type title: str
    :param key: a function to pass to sort to extract the sort key
      given an FD and a sample id.
      Defaults to the value of that sample's entry,
      lambda fd,s:fd[s]
    :type key: function
    :param reverse: True to sort high to low
    :type reverse: bool
    :param cumulative: True to plot running totals instead of raw counts
    :type cumulative: bool
    :raise ValueError: if pylab (matplotlib) cannot be imported
    """
    try:
        import pylab
    except ImportError:
        raise ValueError('The plot function requires the matplotlib package (aka pylab). '
                         'See http://matplotlib.sourceforge.net/')

    # Tick labels were built with Python 2's ``unicode``; fall back to
    # ``str`` where that builtin no longer exists.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if not args:
        args = [len(self)]

    # Remove our own options so they are not forwarded to pylab.plot.
    keyFn = kwargs.pop('key', lambda fd, s: fd[s])
    reverse = kwargs.pop('reverse', False)
    cumulative = kwargs.pop('cumulative', False)

    samples = list(islice(self, *args))
    samples.sort(key=lambda x: keyFn(self, x), reverse=reverse)
    freqs = [self[sample] for sample in samples]
    ylabel = "Counts"
    if cumulative:
        running = 0
        totals = []
        for freq in freqs:
            running += freq
            totals.append(running)
        freqs = totals
        ylabel = "Cumulative Counts"
    # percents = [f * 100 for f in freqs]  only in ProbDist?

    pylab.grid(True, color="silver")
    if "linewidth" not in kwargs:
        kwargs["linewidth"] = 2
    if "title" in kwargs:
        pylab.title(kwargs["title"])
        del kwargs["title"]
    pylab.plot(freqs, **kwargs)
    pylab.xticks(range(len(samples)), [text_type(s) for s in samples], rotation=90)
    pylab.xlabel("Samples")
    pylab.ylabel(ylabel)
    pylab.show()
# Monkey-patch FreqDist so that every frequency distribution gains a
# plotSorted() method.
FreqDist.plotSorted = plotSorted

Mercurial > hg > python

comparison twitter.py @ 0:fee51ab07d09