diff twitter.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/twitter.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,113 @@
+from nltk.corpus.reader.plaintext import PlaintextCorpusReader
+from nltk.corpus.reader import RegexpTokenizer
+from nltk.tokenize import LineTokenizer
+from nltk.corpus.reader.util import read_line_block
+from nltkx.model import NgramModel
+from nltk import ConditionalFreqDist, ngrams,\
+     chain, ConditionalProbDist, WittenBellProbDist, FreqDist
+import types
+
+xtwc=PlaintextCorpusReader("/group/ltg/projects/fnlp/",
+                          r'2.*\.txt',
+                          word_tokenizer=RegexpTokenizer(r'(http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
+                          sent_tokenizer=LineTokenizer(),
+                          para_block_reader=read_line_block)
+
+def discount(self):
+    return float(self._N)/float(self._N + self._T)
+
+def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
+    # http://stackoverflow.com/a/33024979
+    return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
+
+def check(self):
+    totProb=sum(self.prob(sample) for sample in self.samples())
+    assert isclose(self.discount(),totProb),\
+           "discount %s != totProb %s"%(self.discount(),totProb)
+           
+
+# Monkey-patch: attach the discount() and check() functions defined above
+# to NLTK's WittenBellProbDist so they can be called as instance methods.
+WittenBellProbDist.discount = discount
+WittenBellProbDist.check = check
+
+def _estimator(fdist, bins):
+    """
+    Default estimator function using WB.
+    """
+    # can't be an instance method of NgramModel as they
+    # can't be pickled either.
+    res=WittenBellProbDist(fdist,fdist.B()+1)
+    res.check()
+    return res
+
+class LgramModel(NgramModel):
+    def __init__(self, n, train, pad_left=False, pad_right=False,
+                 estimator=None, *estimator_args, **estimator_kwargs):
+        """
+        Same as NgramModel (q.v.), but with a WittenBell default estimator
+        """
+        if estimator is None:
+            assert (not(estimator_args)) and (not(estimator_kwargs)),\
+                   "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
+            estimator=_estimator
+        super(LgramModel,self).__init__(n, train, pad_left, pad_right,
+                                        estimator,
+                                        *estimator_args, **estimator_kwargs)
+
+from nltk.probability import _get_kwarg
+try:
+    from nltk.probability import islice
+except:
+    from nltk.util import islice
+
+def plotSorted(self, *args, **kwargs):
+        """
+        Plot samples from the frequency distribution,
+        sorted using a supplied key function.  If an integer
+        parameter is supplied, stop after this many samples have been
+        plotted.  If two integer parameters m, n are supplied, plot a
+        subset of the samples, beginning with m and stopping at n-1.
+        For a cumulative plot, specify cumulative=True.
+        (Requires Matplotlib to be installed.)
+
+        :param title: The title for the graph
+        :type title: str
+        :param key: a function to pass to sort to extract the sort key
+          given an FD and a sample id.
+          Defaults to the value of that sample's entry,
+          lambda fd,s:fd[s]
+        :type key: function
+        :param reverse: True to sort high to low
+        :type reverse: bool
+        """
+        try:
+            import pylab
+        except ImportError:
+            raise ValueError('The plot function requires the matplotlib package (aka pylab). '
+                         'See http://matplotlib.sourceforge.net/')
+
+        if len(args) == 0:
+            args = [len(self)]
+
+        keyFn = _get_kwarg(kwargs, 'key', lambda fd,s:fd[s])
+        reverse = _get_kwarg(kwargs, 'reverse', False)
+
+        samples = list(islice(self, *args))
+        samples.sort(key=lambda x:keyFn(self,x),reverse=reverse)
+
+        freqs = [self[sample] for sample in samples]
+        ylabel = "Counts"
+        # percents = [f * 100 for f in freqs]  only in ProbDist?
+
+        pylab.grid(True, color="silver")
+        if not "linewidth" in kwargs:
+            kwargs["linewidth"] = 2
+        if "title" in kwargs:
+            pylab.title(kwargs["title"])
+            del kwargs["title"]
+        pylab.plot(freqs, **kwargs)
+        pylab.xticks(range(len(samples)), [unicode(s) for s in samples], rotation=90)
+        pylab.xlabel("Samples")
+        pylab.ylabel(ylabel)
+        pylab.show()
+
+# Monkey-patch: expose plotSorted as a method on NLTK's FreqDist.
+FreqDist.plotSorted=plotSorted