comparison twitter.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:fee51ab07d09
1 from nltk.corpus.reader.plaintext import PlaintextCorpusReader
2 from nltk.corpus.reader import RegexpTokenizer
3 from nltk.tokenize import LineTokenizer
4 from nltk.corpus.reader.util import read_line_block
5 from nltkx.model import NgramModel
6 from nltk import ConditionalFreqDist, ngrams,\
7 chain, ConditionalProbDist, WittenBellProbDist, FreqDist
8 import types
9
# Corpus reader over the files under the project directory whose ids match
# the regexp 2.*\.txt.  Each line is treated as one sentence (LineTokenizer)
# and blocks are read line by line (read_line_block).
#
# Word tokens are, in order of preference: a URL-like run starting with
# http://, ftp:// or mailto://; a run of word characters, '#' and '@';
# or a run of other non-space characters.
#
# The scheme alternatives use a NON-capturing group.  RegexpTokenizer
# matches tokens with re.findall, which returns only the group's text
# (or '' when the group did not take part) if the pattern contains a
# capturing group, so a capturing group here corrupts every token on NLTK
# versions that do not rewrite the pattern themselves.
xtwc = PlaintextCorpusReader(
    "/group/ltg/projects/fnlp/",
    r'2.*\.txt',
    word_tokenizer=RegexpTokenizer(
        r'(?:http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
    sent_tokenizer=LineTokenizer(),
    para_block_reader=read_line_block)
15
def discount(self):
    """
    Return _N / (_N + _T) as a float, using this object's _N and _T
    attributes.

    This is the value that check() compares with the summed probability
    of the distribution's samples.
    """
    seen = self._N
    denominator = float(seen + self._T)
    return float(seen) / denominator
18
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """
    Return True if a and b are equal within the given tolerances.

    The allowed difference is the larger of abs_tol and rel_tol times
    the larger magnitude of the two values.
    Recipe taken from http://stackoverflow.com/a/33024979

    :param a: first value
    :param b: second value
    :param rel_tol: relative tolerance
    :param abs_tol: absolute tolerance (a floor for the allowed difference)
    :rtype: bool
    """
    difference = abs(a - b)
    relative_bound = rel_tol * max(abs(a), abs(b))
    allowed = max(relative_bound, abs_tol)
    return difference <= allowed
22
def check(self):
    """
    Sanity check: the probabilities of all of this distribution's
    samples must add up to self.discount(), within isclose's default
    tolerance.

    :raise AssertionError: if the two values differ
    """
    mass = 0
    for sample in self.samples():
        mass += self.prob(sample)
    assert isclose(self.discount(), mass), \
        "discount %s != totProb %s" % (self.discount(), mass)
27
28
# Attach the two helpers defined in this module to WittenBellProbDist as
# methods, so that every instance gains .discount() and .check().
setattr(WittenBellProbDist, 'discount', discount)
setattr(WittenBellProbDist, 'check', check)
31
def _estimator(fdist, bins):
    """
    Default estimator function: a Witten-Bell distribution over fdist.

    The bins argument is accepted but not used; the number of bins given
    to WittenBellProbDist is always fdist.B() + 1.  The new distribution
    is verified with its check() method before it is returned.
    """
    # Kept as a module-level function: per the original note, it can't be
    # an instance method of NgramModel, as those can't be pickled either.
    n_bins = fdist.B() + 1
    wb_dist = WittenBellProbDist(fdist, n_bins)
    wb_dist.check()
    return wb_dist
41
class LgramModel(NgramModel):
    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Same as NgramModel (q.v.), except that when no estimator is
        given the module's Witten-Bell _estimator is used.

        Extra estimator arguments only make sense together with an
        explicit estimator; supplying them without one trips an
        assertion.
        """
        if estimator is None:
            extras_given = bool(estimator_args) or bool(estimator_kwargs)
            assert not extras_given, \
                "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
            estimator = _estimator
        super(LgramModel, self).__init__(
            n, train, pad_left, pad_right, estimator,
            *estimator_args, **estimator_kwargs)
55
56 from nltk.probability import _get_kwarg
# islice is itertools.islice; the nltk modules merely re-export it, and the
# old try/except probed for whichever module happened to do so (with a bare
# "except:" that would also have hidden unrelated errors).  Import it from
# the standard library directly instead.
from itertools import islice
61
def plotSorted(self, *args, **kwargs):
    """
    Plot samples from the frequency distribution,
    sorted using a supplied key function.  If an integer
    parameter is supplied, stop after this many samples have been
    taken.  If two integer parameters m, n are supplied, use a
    subset of the samples, beginning with m and stopping at n-1.
    The samples are selected first and sorted afterwards.
    For a cumulative plot, specify cumulative=True.
    (Requires Matplotlib to be installed.)

    Any keyword argument not listed below is passed on to pylab.plot;
    linewidth defaults to 2.

    :param title: The title for the graph
    :type title: str
    :param key: a function to pass to sort to extract the sort key
      given an FD and a sample id.
      Defaults to the value of that sample's entry,
      lambda fd,s:fd[s]
    :type key: function
    :param reverse: True to sort high to low
    :type reverse: bool
    :param cumulative: True to plot running totals of the counts,
      accumulated in the sorted order
    :type cumulative: bool
    :raise ValueError: if pylab cannot be imported
    """
    try:
        import pylab
    except ImportError:
        raise ValueError('The plot function requires the matplotlib package (aka pylab). '
                         'See http://matplotlib.sourceforge.net/')

    # With no positional arguments, take every sample.
    if not args:
        args = [len(self)]

    # Consume our own options before the rest of kwargs goes to
    # pylab.plot, which rejects keywords it does not know.
    keyFn = _get_kwarg(kwargs, 'key', lambda fd, s: fd[s])
    reverse = _get_kwarg(kwargs, 'reverse', False)
    cumulative = _get_kwarg(kwargs, 'cumulative', False)
    title = kwargs.pop("title", None)
    kwargs.setdefault("linewidth", 2)

    samples = list(islice(self, *args))
    samples.sort(key=lambda x: keyFn(self, x), reverse=reverse)

    freqs = [self[sample] for sample in samples]
    ylabel = "Counts"
    if cumulative:
        # Running total over the samples in their sorted order.
        running = 0
        totals = []
        for freq in freqs:
            running += freq
            totals.append(running)
        freqs = totals
        ylabel = "Cumulative Counts"

    # The unicode builtin exists only on Python 2; fall back to str.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    pylab.grid(True, color="silver")
    if title is not None:
        pylab.title(title)
    pylab.plot(freqs, **kwargs)
    pylab.xticks(range(len(samples)), [to_text(s) for s in samples], rotation=90)
    pylab.xlabel("Samples")
    pylab.ylabel(ylabel)
    pylab.show()
112
# Attach plotSorted to FreqDist as a method, so every frequency
# distribution gains .plotSorted().
setattr(FreqDist, 'plotSorted', plotSorted)