annotate twitter.py @ 9:28a7b0e992cb

lots of fails in trace of 5a wrt checkNew/found0/foundN???
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 11 Mar 2020 22:06:04 +0000
parents fee51ab07d09
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 from nltk.corpus.reader.plaintext import PlaintextCorpusReader
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 from nltk.corpus.reader import RegexpTokenizer
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 from nltk.tokenize import LineTokenizer
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 from nltk.corpus.reader.util import read_line_block
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 from nltkx.model import NgramModel
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 from nltk import ConditionalFreqDist, ngrams,\
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 chain, ConditionalProbDist, WittenBellProbDist, FreqDist
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 import types
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 xtwc=PlaintextCorpusReader("/group/ltg/projects/fnlp/",
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 r'2.*\.txt',
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 word_tokenizer=RegexpTokenizer(r'(http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 sent_tokenizer=LineTokenizer(),
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 para_block_reader=read_line_block)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 def discount(self):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 return float(self._N)/float(self._N + self._T)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19 def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
20 # http://stackoverflow.com/a/33024979
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
22
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23 def check(self):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24 totProb=sum(self.prob(sample) for sample in self.samples())
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25 assert isclose(self.discount(),totProb),\
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26 "discount %s != totProb %s"%(self.discount(),totProb)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 WittenBellProbDist.discount = discount
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 WittenBellProbDist.check = check
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 def _estimator(fdist, bins):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 Default estimator function using WB.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36 # can't be an instance method of NgramModel as they
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37 # can't be pickled either.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
38 res=WittenBellProbDist(fdist,fdist.B()+1)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
39 res.check()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
40 return res
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
42 class LgramModel(NgramModel):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
43 def __init__(self, n, train, pad_left=False, pad_right=False,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
44 estimator=None, *estimator_args, **estimator_kwargs):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
45 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
46 Same as NgramModel (q.v.), but with a WittenBell default estimator
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48 if estimator is None:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
49 assert (not(estimator_args)) and (not(estimator_kwargs)),\
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
50 "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
51 estimator=_estimator
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
52 super(LgramModel,self).__init__(n, train, pad_left, pad_right,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53 estimator,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54 *estimator_args, **estimator_kwargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
56 from nltk.probability import _get_kwarg
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
57 try:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
58 from nltk.probability import islice
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
59 except:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
60 from nltk.util import islice
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
61
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
62 def plotSorted(self, *args, **kwargs):
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
63 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
64 Plot samples from the frequency distribution,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
65 sorted using a supplied key function. If an integer
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
66 parameter is supplied, stop after this many samples have been
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
67 plotted. If two integer parameters m, n are supplied, plot a
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
68 subset of the samples, beginning with m and stopping at n-1.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
69 For a cumulative plot, specify cumulative=True.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
70 (Requires Matplotlib to be installed.)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
71
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
72 :param title: The title for the graph
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
73 :type title: str
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
74 :param key: a function to pass to sort to extract the sort key
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
75 given an FD and a sample id.
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
76 Defaults to the value of that sample's entry,
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
77 lambda fd,s:fd[s]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
78 :type key: function
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
79 :param reverse: True to sort high to low
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
80 :type reverse: bool
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
81 """
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
82 try:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
83 import pylab
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
84 except ImportError:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
85 raise ValueError('The plot function requires the matplotlib package (aka pylab). '
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
86 'See http://matplotlib.sourceforge.net/')
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
87
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
88 if len(args) == 0:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
89 args = [len(self)]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
90
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
91 keyFn = _get_kwarg(kwargs, 'key', lambda fd,s:fd[s])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
92 reverse = _get_kwarg(kwargs, 'reverse', False)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
93
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
94 samples = list(islice(self, *args))
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
95 samples.sort(key=lambda x:keyFn(self,x),reverse=reverse)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
96
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
97 freqs = [self[sample] for sample in samples]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
98 ylabel = "Counts"
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
99 # percents = [f * 100 for f in freqs] only in ProbDist?
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
100
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
101 pylab.grid(True, color="silver")
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
102 if not "linewidth" in kwargs:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
103 kwargs["linewidth"] = 2
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
104 if "title" in kwargs:
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
105 pylab.title(kwargs["title"])
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
106 del kwargs["title"]
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
107 pylab.plot(freqs, **kwargs)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
108 pylab.xticks(range(len(samples)), [unicode(s) for s in samples], rotation=90)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
109 pylab.xlabel("Samples")
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
110 pylab.ylabel(ylabel)
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
111 pylab.show()
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
112
fee51ab07d09 blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
113 FreqDist.plotSorted=plotSorted