Mercurial > hg > python
comparison twitter.py @ 0:fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 09 Mar 2020 14:58:04 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fee51ab07d09 |
---|---|
1 from nltk.corpus.reader.plaintext import PlaintextCorpusReader | |
2 from nltk.corpus.reader import RegexpTokenizer | |
3 from nltk.tokenize import LineTokenizer | |
4 from nltk.corpus.reader.util import read_line_block | |
5 from nltkx.model import NgramModel | |
6 from nltk import ConditionalFreqDist, ngrams,\ | |
7 chain, ConditionalProbDist, WittenBellProbDist, FreqDist | |
8 import types | |
9 | |
# Plain-text corpus reader over the files matching r'2.*\.txt' under
# /group/ltg/projects/fnlp/ (presumably one tweet per line -- confirm
# against the data).  Every line is one sentence (LineTokenizer) and one
# paragraph block (read_line_block).
xtwc = PlaintextCorpusReader(
    "/group/ltg/projects/fnlp/",
    r'2.*\.txt',
    # Token alternatives, tried in this order: a URL-like string
    # (scheme://non-space...), a run of word characters plus '#'/'@'
    # (hashtags, mentions), or a run of other non-space characters.
    # The scheme alternation is a NON-capturing group: NLTK 3's
    # RegexpTokenizer tokenizes with findall(), which returns only the
    # group's text when the pattern contains a capturing group.
    word_tokenizer=RegexpTokenizer(
        r'(?:http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
    sent_tokenizer=LineTokenizer(),
    para_block_reader=read_line_block)
15 | |
def discount(self):
    """
    Return ``_N / (_N + _T)`` for this distribution as a float.

    Written as a plain function taking ``self`` so that it can be attached
    to ``WittenBellProbDist`` as a method.  ``_N`` and ``_T`` are read
    from the instance (in NLTK's WittenBellProbDist these are presumably
    the total count and the number of observed sample types -- confirm
    against nltk.probability).

    :return: the ratio, or 0.0 when ``_N + _T`` is zero (an empty
        distribution has no observed samples, so nothing to sum over)
    :rtype: float
    """
    total = self._N + self._T
    if not total:
        # Avoid ZeroDivisionError for a distribution built from no data.
        return 0.0
    return float(self._N) / float(total)
18 | |
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """
    Return True if ``a`` and ``b`` are approximately equal.

    Two values are close when their absolute difference is no larger than
    ``rel_tol`` times the larger magnitude, or no larger than ``abs_tol``,
    whichever bound is greater.

    :param a: first number
    :param b: second number
    :param rel_tol: relative tolerance, scaled by max(abs(a), abs(b))
    :param abs_tol: absolute tolerance floor (needed for comparisons
        against zero, where the relative bound collapses to 0)
    :return: whether the two values are within tolerance
    :rtype: bool
    """
    # http://stackoverflow.com/a/33024979
    if a == b:
        # Exact equality, including two infinities of the same sign, for
        # which a - b would be NaN and the comparison below would fail.
        return True
    inf = float("inf")
    if abs(a) == inf or abs(b) == inf:
        # An infinity is never close to anything but itself; without this
        # guard the tolerance bound is itself infinite and inf <= inf holds.
        return False
    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
22 | |
def check(self):
    """
    Sanity-check this distribution.

    Adds up ``self.prob(s)`` over every ``s`` in ``self.samples()`` and
    asserts that the total is close (per ``isclose``) to
    ``self.discount()``.

    :raises AssertionError: if the two quantities differ beyond tolerance
    """
    seen_mass = 0
    for outcome in self.samples():
        seen_mass += self.prob(outcome)
    assert isclose(self.discount(), seen_mass),\
        "discount %s != totProb %s"%(self.discount(), seen_mass)
27 | |
28 | |
# Attach the two module-level helpers to NLTK's WittenBellProbDist so
# that every instance gains .discount() and .check() methods.
WittenBellProbDist.discount = discount
WittenBellProbDist.check = check
31 | |
def _estimator(fdist, bins):
    """
    Default estimator function using WB.

    Builds a WittenBellProbDist over ``fdist`` with one more bin than
    ``fdist.B()`` reports, runs its ``check()`` self-test, and returns it.

    :param fdist: the frequency distribution to smooth
    :param bins: accepted for the estimator call signature; not used here
    :return: the checked Witten-Bell distribution
    """
    # Lives at module level: it can't be an instance method of NgramModel
    # as they can't be pickled either.
    n_bins = fdist.B() + 1
    wb_dist = WittenBellProbDist(fdist, n_bins)
    wb_dist.check()
    return wb_dist
41 | |
class LgramModel(NgramModel):
    """NgramModel subclass that falls back to a Witten-Bell estimator."""

    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Same as NgramModel (q.v.), but with a WittenBell default estimator

        All arguments are forwarded unchanged to ``NgramModel.__init__``,
        except that a missing ``estimator`` is replaced by the module's
        ``_estimator``.

        :raises AssertionError: if estimator arguments are supplied
            without an estimator to receive them
        """
        if estimator is None:
            # Extra estimator arguments make no sense for the default.
            assert not (estimator_args or estimator_kwargs),\
                "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
            estimator = _estimator
        super(LgramModel, self).__init__(
            n, train, pad_left, pad_right,
            estimator,
            *estimator_args, **estimator_kwargs)
55 | |
56 from nltk.probability import _get_kwarg | |
# Locate islice wherever this NLTK version exposes it.  Only ImportError
# is caught (a bare except would also swallow KeyboardInterrupt and
# genuine errors raised while importing NLTK).  The last resort is the
# standard library, which is presumably what NLTK re-exports anyway.
try:
    from nltk.probability import islice
except ImportError:
    try:
        from nltk.util import islice
    except ImportError:
        from itertools import islice
61 | |
def plotSorted(self, *args, **kwargs):
    """
    Plot samples from the frequency distribution,
    sorted using a supplied key function.  If an integer
    parameter is supplied, stop after this many samples have been
    plotted.  If two integer parameters m, n are supplied, plot a
    subset of the samples, beginning with m and stopping at n-1.
    For a cumulative plot, specify cumulative=True.
    (Requires Matplotlib to be installed.)

    Note that the positional arguments are applied first, as an islice
    over the distribution's own iteration order; only the samples
    selected that way are then sorted.

    :param title: The title for the graph
    :type title: str
    :param key: a function to pass to sort to extract the sort key
      given an FD and a sample id.
      Defaults to the value of that sample's entry,
      lambda fd,s:fd[s]
    :type key: function
    :param reverse: True to sort high to low
    :type reverse: bool
    :param cumulative: True to plot running totals of the sorted counts
      instead of the individual counts
    :type cumulative: bool

    Any remaining keyword arguments are passed through to ``pylab.plot``
    (``linewidth`` defaults to 2).

    :raises ValueError: if pylab cannot be imported
    """
    try:
        import pylab
    except ImportError:
        raise ValueError('The plot function requires the matplotlib package (aka pylab). '
                         'See http://matplotlib.sourceforge.net/')

    if len(args) == 0:
        args = [len(self)]

    # _get_kwarg is expected to remove the options it reads, so that they
    # are not forwarded to pylab.plot below -- confirm against NLTK.
    keyFn = _get_kwarg(kwargs, 'key', lambda fd, s: fd[s])
    reverse = _get_kwarg(kwargs, 'reverse', False)
    cumulative = _get_kwarg(kwargs, 'cumulative', False)

    samples = list(islice(self, *args))
    samples.sort(key=lambda x: keyFn(self, x), reverse=reverse)

    if cumulative:
        # Running total over the samples in their sorted order.
        freqs = []
        running = 0
        for sample in samples:
            running += self[sample]
            freqs.append(running)
        ylabel = "Cumulative Counts"
    else:
        freqs = [self[sample] for sample in samples]
        ylabel = "Counts"

    # Tick labels must be text; `unicode` only exists on Python 2.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    pylab.grid(True, color="silver")
    if "linewidth" not in kwargs:
        kwargs["linewidth"] = 2
    if "title" in kwargs:
        # The title is set separately; it is not an option of pylab.plot.
        pylab.title(kwargs.pop("title"))
    pylab.plot(freqs, **kwargs)
    pylab.xticks(range(len(samples)), [to_text(s) for s in samples], rotation=90)
    pylab.xlabel("Samples")
    pylab.ylabel(ylabel)
    pylab.show()
112 | |
# Attach plotSorted to NLTK's FreqDist so that it can be called as a
# method on any frequency distribution.
FreqDist.plotSorted = plotSorted