Mercurial > hg > python
annotate hmm/semiSup.py @ 25:0bbeb01a7681
combine from various sources
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 29 May 2021 21:32:41 +0100 |
parents | 26d9c0308fcf |
children |
rev | line source |
---|---|
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 '''Exploring the claim that a small dictionary can seed |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
2 an otherwise unsupervised HMM to learn a decent POS-tagger''' |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
3 import nltk, random |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
4 from nltk.corpus import brown |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
5 from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
6 from nltk.probability import FreqDist,ConditionalFreqDist |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
7 from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
8 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
9 trainTagsPercent=0.99 |
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
10 trainHMMPercent=0.9 |
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
11 knownWordsPercent=0.99 |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
12 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
13 TAGSETS={ |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
14 'univ': |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
15 [u'ADJ', u'ADP', u'ADV', u'CONJ', u'DET', u'NOUN', u'NUM', |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
16 u'PRON', u'PRT', u'VERB', u'X', u'.'], |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
17 'brown': |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
18 [u"ABL", u"ABN", u"ABX", u"AP", u"AP$", u"AP+AP", u"AT", u"BE", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
19 u"BED", u"BED*", u"BEDZ", u"BEDZ*", u"BEG", u"BEM", u"BEM*", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
20 u"BEN", u"BER", u"BER*", u"BEZ", u"BEZ*", u"CC", u"CD", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
21 u"CD$", u"CS", u"DO", u"DO*", u"DO+PPSS", u"DOD", u"DOD*", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
22 u"DOZ", u"DOZ*", u"DT", u"DT$", u"DT+BEZ", u"DT+MD", u"DTI", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
23 u"DTS", u"DTS+BEZ", u"DTX", u"EX", u"EX+BEZ", u"EX+HVD", u"EX+HVZ", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
24 u"EX+MD", u"FW-*", u"FW-AT", u"FW-AT+NN", u"FW-AT+NP", u"FW-BE", u"FW-BER", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
25 u"FW-BEZ", u"FW-CC", u"FW-CD", u"FW-CS", u"FW-DT", u"FW-DT+BEZ", u"FW-DTS", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
26 u"FW-HV", u"FW-IN", u"FW-IN+AT", u"FW-IN+NN", u"FW-IN+NP", u"FW-JJ", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
27 u"FW-JJR", u"FW-JJT", u"FW-NN", u"FW-NN$", u"FW-NNS", u"FW-NP", u"FW-NPS", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
28 u"FW-NR", u"FW-OD", u"FW-PN", u"FW-PP$", u"FW-PPL", u"FW-PPL+VBZ", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
29 u"FW-PPO", u"FW-PPO+IN", u"FW-PPS", u"FW-PPSS", u"FW-PPSS+HV", u"FW-QL", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
30 u"FW-RB", u"FW-RB+CC", u"FW-TO+VB", u"FW-UH", u"FW-VB", u"FW-VBD", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
31 u"FW-VBG", u"FW-VBN", u"FW-VBZ", u"FW-WDT", u"FW-WPO", u"FW-WPS", u"HV", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
32 u"HV*", u"HV+TO", u"HVD", u"HVD*", u"HVG", u"HVN", u"HVZ", u"HVZ*", u"IN", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
33 u"IN+IN", u"IN+PPO", u"JJ", u"JJ$", u"JJ+JJ", u"JJR", u"JJR+CS", u"JJS", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
34 u"JJT", u"MD", u"MD*", u"MD+HV", u"MD+PPSS", u"MD+TO", u"NN", u"NN$", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
35 u"NN+BEZ", u"NN+HVD", u"NN+HVZ", u"NN+IN", u"NN+MD", u"NN+NN", u"NNS", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
36 u"NNS$", u"NNS+MD", u"NP", u"NP$", u"NP+BEZ", u"NP+HVZ", u"NP+MD", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
37 u"NPS", u"NPS$", u"NR", u"NR$", u"NR+MD", u"NRS", u"OD", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
38 u"PN", u"PN$", u"PN+BEZ", u"PN+HVD", u"PN+HVZ", u"PN+MD", u"PP$", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
39 u"PP$$", u"PPL", u"PPLS", u"PPO", u"PPS", u"PPS+BEZ", u"PPS+HVD", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
40 u"PPS+HVZ", u"PPS+MD", u"PPSS", u"PPSS+BEM", u"PPSS+BER", u"PPSS+BEZ", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
41 u"PPSS+BEZ*", u"PPSS+HV", u"PPSS+HVD", u"PPSS+MD", u"PPSS+VB", u"QL", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
42 u"QLP", u"RB", u"RB$", u"RB+BEZ", u"RB+CS", u"RBR", u"RBR+CS", u"RBT", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
43 u"RN", u"RP", u"RP+IN", u"TO", u"TO+VB", u"UH", u"VB", u"VB+AT", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
44 u"VB+IN", u"VB+JJ", u"VB+PPO", u"VB+RP", u"VB+TO", u"VB+VB", u"VBD", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
45 u"VBG", u"VBG+TO", u"VBN", u"VBN+TO", u"VBZ", u"WDT", u"WDT+BER", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
46 u"WDT+BER+PP", u"WDT+BEZ", u"WDT+DO+PPS", u"WDT+DOD", u"WDT+HVZ", u"WP$", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
47 u"WPO", u"WPS", u"WPS+BEZ", u"WPS+HVD", u"WPS+HVZ", u"WPS+MD", u"WQL", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
48 u"WRB", u"WRB+BER", u"WRB+BEZ", u"WRB+DO", u"WRB+DOD", u"WRB+DOD*", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
49 u"WRB+DOZ", u"WRB+IN", u"WRB+MD", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
50 u"(", u")", u"*", u",", u"--", u".", u":"], |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
51 'upenn': |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
52 [u"CC", u"CD", u"DT", u"EX", u"FW", u"IN", u"JJ", u"JJR", u"JJS", u"LS", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
53 u"MD", u"NN", u"NNP", u"NNPS", u"NNS", u"PDT", u"POS", u"PRP", u"PRP$", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
54 u"RB", u"RBR", u"RBS", u"RP", u"SYM", u"TO", u"UH", u"VB", u"VBD", u"VBG", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
55 u"VBN", u"VBP", u"VBZ", u"WDT", u"WP", u"WP$", u"WRB", |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
56 u"``", u"$", u"''", u"(", u")", u",", u"--", u".", u":"]} |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
57 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
58 TAGSETS['universal']=TAGSETS['univ'] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
59 TAGSETS['penn']=TAGSETS['upenn'] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
60 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
61 def setup(cat='news',tagset='brown',corpus=brown): |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
62 return ([[(word.lower(),tag) for (word,tag) in s] |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
63 for s in corpus.tagged_sents(categories=cat,tagset=tagset)], |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
64 [(word.lower(),tag) for (word,tag) in corpus.tagged_words(categories=cat,tagset=tagset)], |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
65 TAGSETS[tagset]) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
66 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
67 def notCurrent(s,missList): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
68 global i,n,done |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
69 if done or (missList[i] is not s): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
70 return True |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
71 else: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
72 i+=1 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
73 if i==n: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
74 done=True |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
75 return False |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
76 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
77 def splitData(words,wordPercent,sentences,sentPercent): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
78 global i,n, done |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
79 trainWords=random.sample(words,int(wordPercent*len(words))) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
80 # random.sample(sentences,int(sentPercent*len(sentences))) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
81 trainSents=[s for s in sentences if random.random()<sentPercent] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
82 # hack! |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
83 i=0 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
84 n=len(trainSents) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
85 done=False |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
86 testSents=[s for s in sentences if notCurrent(s,trainSents)] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
87 return trainWords, trainSents, testSents |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
88 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
89 def pickWords(tagged,percent): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
90 #wToT=ConditionalFreqDist(tagged) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
91 tToW=ConditionalFreqDist((t,w) for (w,t) in tagged) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
92 #print len(tToW[u'ADV']) |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
93 return dict((tag,(lambda wl,p=percent:\ |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
94 wl[:int(p*len(wl))])( |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
95 sorted(tToW[tag].items(),key=lambda (k,v):v,reverse=True))) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
96 for tag in tToW.keys()) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
97 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
98 (tagged_s,tagged_w,tagset)=setup(tagset='universal') |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
99 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
100 wordTokens=FreqDist(word for word,tag in tagged_w) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
101 print len(wordTokens), wordTokens.N() |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
102 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
103 (trainTags,trainHMM,testHMM)=splitData(tagged_w,trainTagsPercent, |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
104 tagged_s,trainHMMPercent) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
105 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
106 knownWords=pickWords(trainTags,knownWordsPercent) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
107 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
108 class SubsetFreqDist(FreqDist): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
109 def __init__(self,pairs,baseset,basecount=.05): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
110 dict.update(self,pairs) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
111 self._baseset=baseset |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
112 self._basecount=basecount |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
113 pn=sum(n for w,n in pairs) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
114 self._N=pn+((len(baseset)-len(pairs))*basecount) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
115 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
116 def __getitem__(self,key): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
117 return dict.__getitem__(self,key) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
118 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
119 def __missing__(self,key): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
120 if key in self._baseset: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
121 return self._basecount |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
122 else: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
123 return 0 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
124 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
125 def N(self): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
126 return self._N |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
127 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
128 class Tag: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
129 def __init__(self,tag,wordsAndCounts): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
130 self._tag=tag |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
131 self._wordsAndCounts=wordsAndCounts |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
132 self._words=set(w for w,n in wordsAndCounts) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
133 self._nTokens=sum(n for w,n in wordsAndCounts) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
134 self._nTypes=len(self._words) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
135 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
136 def words(self): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
137 return self._words |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
138 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
139 def buildPD(self,tokens): |
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
140 self._sfd=SubsetFreqDist(self._wordsAndCounts,tokens) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
141 self._pd=MLEProbDist(self._sfd) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
142 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
143 def getSFD(self): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
144 return self._sfd |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
145 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
146 def getPD(self): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
147 return self._pd |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
148 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
149 tags=dict((tagName,Tag(tagName,wl)) for tagName,wl in knownWords.items()) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
150 kws=dict((tagName,tag.words()) for tagName,tag in tags.items()) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
151 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
152 t2=list(filter(None, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
153 ((lambda i:False if not i[1] else i) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
154 (((tagset[i],tagset[j]), |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
155 kws[tagset[i]].intersection(kws[tagset[j]])),) |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
156 for i in xrange(0,len(tagset)) |
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
157 for j in xrange(i+1,len(tagset))))) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
158 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
159 for tag in tags.values(): |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
160 tag.buildPD(wordTokens) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
161 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
162 priors = RandomProbDist(tagset) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
163 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
164 transitions = DictionaryConditionalProbDist( |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
165 dict((state, RandomProbDist(tagset)) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
166 for state in tagset)) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
167 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
168 outputs = DictionaryConditionalProbDist( |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
169 dict((state, tags[state].getPD()) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
170 for state in tagset)) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
171 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
172 model = HiddenMarkovModelTagger(wordTokens, tagset, |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
173 transitions, outputs, priors) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
174 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
175 print model.evaluate(testHMM) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
176 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
177 nm=HiddenMarkovModelTrainer(states=tagset,symbols=wordTokens) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
178 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
179 # Note that contrary to naive reading of the documentation, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
180 # train_unsupervised expects a sequence of sequences of word/tag pairs, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
181 # it just ignores the tags |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
182 nnm=nm.train_unsupervised(trainHMM,model=model,max_iterations=15,updateOutputs=False) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
183 |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
184 print nnm.evaluate(testHMM) |