Mercurial > hg > python
annotate twitter.py @ 23:1670a33e3e6d
from markup
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 29 May 2021 11:07:34 +0100 |
parents | fee51ab07d09 |
children |
rev | line source |
---|---|
0
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 from nltk.corpus.reader.plaintext import PlaintextCorpusReader |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 from nltk.corpus.reader import RegexpTokenizer |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 from nltk.tokenize import LineTokenizer |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 from nltk.corpus.reader.util import read_line_block |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 from nltkx.model import NgramModel |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 from nltk import ConditionalFreqDist, ngrams,\ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 chain, ConditionalProbDist, WittenBellProbDist, FreqDist |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 import types |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 xtwc=PlaintextCorpusReader("/group/ltg/projects/fnlp/", |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 r'2.*\.txt', |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 word_tokenizer=RegexpTokenizer(r'(http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'), |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 sent_tokenizer=LineTokenizer(), |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 para_block_reader=read_line_block) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 def discount(self): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 return float(self._N)/float(self._N + self._T) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 # http://stackoverflow.com/a/33024979 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 def check(self): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 totProb=sum(self.prob(sample) for sample in self.samples()) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 assert isclose(self.discount(),totProb),\ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 "discount %s != totProb %s"%(self.discount(),totProb) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
29 WittenBellProbDist.discount = discount |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
30 WittenBellProbDist.check = check |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
32 def _estimator(fdist, bins): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
33 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
34 Default estimator function using WB. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
36 # can't be an instance method of NgramModel as they |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
37 # can't be pickled either. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
38 res=WittenBellProbDist(fdist,fdist.B()+1) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
39 res.check() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
40 return res |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
42 class LgramModel(NgramModel): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
43 def __init__(self, n, train, pad_left=False, pad_right=False, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
44 estimator=None, *estimator_args, **estimator_kwargs): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
45 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
46 Same as NgramModel (q.v.), but with a WittenBell default estimator |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
47 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
48 if estimator is None: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
49 assert (not(estimator_args)) and (not(estimator_kwargs)),\ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
50 "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
51 estimator=_estimator |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
52 super(LgramModel,self).__init__(n, train, pad_left, pad_right, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
53 estimator, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
54 *estimator_args, **estimator_kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
55 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
56 from nltk.probability import _get_kwarg |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
57 try: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
58 from nltk.probability import islice |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
59 except: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
60 from nltk.util import islice |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
61 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
62 def plotSorted(self, *args, **kwargs): |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
63 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
64 Plot samples from the frequency distribution, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
65 sorted using a supplied key function. If an integer |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
66 parameter is supplied, stop after this many samples have been |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
67 plotted. If two integer parameters m, n are supplied, plot a |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
68 subset of the samples, beginning with m and stopping at n-1. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
69 For a cumulative plot, specify cumulative=True. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
70 (Requires Matplotlib to be installed.) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
71 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
72 :param title: The title for the graph |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
73 :type title: str |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
74 :param key: a function to pass to sort to extract the sort key |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
75 given an FD and a sample id. |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
76 Defaults to the value of that sample's entry, |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
77 lambda fd,s:fd[s] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
78 :type key: function |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
79 :param reverse: True to sort high to low |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
80 :type reverse: bool |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
81 """ |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
82 try: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
83 import pylab |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
84 except ImportError: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
85 raise ValueError('The plot function requires the matplotlib package (aka pylab). ' |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
86 'See http://matplotlib.sourceforge.net/') |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
87 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
88 if len(args) == 0: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
89 args = [len(self)] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
90 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
91 keyFn = _get_kwarg(kwargs, 'key', lambda fd,s:fd[s]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
92 reverse = _get_kwarg(kwargs, 'reverse', False) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
93 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
94 samples = list(islice(self, *args)) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
95 samples.sort(key=lambda x:keyFn(self,x),reverse=reverse) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
96 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
97 freqs = [self[sample] for sample in samples] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
98 ylabel = "Counts" |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
99 # percents = [f * 100 for f in freqs] only in ProbDist? |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
100 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
101 pylab.grid(True, color="silver") |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
102 if not "linewidth" in kwargs: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
103 kwargs["linewidth"] = 2 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
104 if "title" in kwargs: |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
105 pylab.title(kwargs["title"]) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
106 del kwargs["title"] |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
107 pylab.plot(freqs, **kwargs) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
108 pylab.xticks(range(len(samples)), [unicode(s) for s in samples], rotation=90) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
109 pylab.xlabel("Samples") |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
110 pylab.ylabel(ylabel) |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
111 pylab.show() |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
112 |
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
113 FreqDist.plotSorted=plotSorted |