annotate hmm/tinySup.py @ 25:0bbeb01a7681

combine from various sources
author Henry Thompson <ht@markup.co.uk>
date Sat, 29 May 2021 21:32:41 +0100
parents 26d9c0308fcf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
1 '''Trivial test of unsupervised learning with full dictionary supplied
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
2 See fnlp/lectures/12/hmmDNV.xlsx'''
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
3 import nltk, random
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
4 from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
5 from nltk.probability import FreqDist,ConditionalFreqDist
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
6 from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
7
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
# Hidden states of the toy HMM: the two sentence-boundary markers plus the
# three word classes D, N and V.
tagset = ['<s>', 'D', 'N', 'V', '</s>']

# Observable vocabulary; the boundary markers are ordinary symbols here.
symbols = ['<s>', 'the', 'sheep', 'run', '</s>']

# Tiny corpus of (word, tag) sentences, each wrapped in <s> ... </s>.
sents = [
    [('<s>', '<s>'), ('the', 'D'), ('sheep', 'N'), ('run', 'V'),
     ('</s>', '</s>')],
    [('<s>', '<s>'), ('sheep', 'N'), ('run', 'V'), ('the', 'D'),
     ('sheep', 'N'), ('</s>', '</s>')],
    [('<s>', '<s>'), ('run', 'V'), ('the', 'D'), ('sheep', 'N'),
     ('</s>', '</s>')],
]

# Hand-specified emission weights, one (tag, [(word, weight), ...]) entry per
# tag.  Every row lists the whole vocabulary and its weights sum to 1; the
# boundary tags emit only their own marker.
taglists = [
    ('<s>', [('<s>', 1), ('the', 0), ('sheep', 0), ('run', 0), ('</s>', 0)]),
    ('D', [('the', .8), ('sheep', .1), ('run', .1), ('<s>', 0), ('</s>', 0)]),
    ('N', [('the', .2), ('sheep', .4), ('run', .4), ('<s>', 0), ('</s>', 0)]),
    ('V', [('the', .2), ('sheep', .4), ('run', .4), ('<s>', 0), ('</s>', 0)]),
    ('</s>', [('<s>', 0), ('the', 0), ('sheep', 0), ('run', 0), ('</s>', 1)]),
]
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
19
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
# Emission model: one MLE distribution per tag, built by feeding the
# hand-specified word weights from taglists to FreqDist as counts.
tagdict = {tag: MLEProbDist(FreqDist(dict(word_weights)))
           for tag, word_weights in taglists}

# Start-state distribution: the only non-zero count is on '<s>'.
priors = MLEProbDist(
    FreqDist({'<s>': 1, 'D': 0, 'N': 0, 'V': 0, '</s>': 0}))

# Transition model: a separate RandomProbDist over the tagset for every
# source state, created in tagset order.
transitions = DictionaryConditionalProbDist(
    {state: RandomProbDist(tagset) for state in tagset})

outputs = DictionaryConditionalProbDist(tagdict)
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
33
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
34
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
# Sanity check: print, for each tag, the total emission probability over the
# vocabulary.
for tag in tagset:
    cp = outputs[tag]
    # Single-argument print(...) with explicit formatting emits the same
    # "tag total" text under both Python 2 and Python 3.
    print("%s %s" % (tag, sum(cp.prob(s) for s in symbols)))
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
38
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
# Initial tagger: hand-specified emissions and priors, with the transition
# table built above.
model = HiddenMarkovModelTagger(symbols, tagset,
                                transitions, outputs, priors)

# Repeat the emission sanity check, this time reading the distributions back
# from the constructed model.
for tag in tagset:
    cp = model._outputs[tag]
    # Single-argument print(...) with explicit formatting emits the same
    # "tag total" text under both Python 2 and Python 3.
    print("%s %s" % (tag, sum(cp.prob(s) for s in symbols)))
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
45
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
nm = HiddenMarkovModelTrainer(states=tagset, symbols=symbols)

# Despite what a naive reading of the documentation suggests,
# train_unsupervised wants a sequence of sequences of (word, tag) pairs;
# the tags are simply ignored.
# NOTE(review): updateOutputs is not a documented keyword of stock NLTK's
# train_unsupervised -- presumably a local extension that keeps the emission
# table fixed during re-estimation; confirm against the installed nltk.
nnm = nm.train_unsupervised(
    sents,
    model=model,
    max_iterations=15,
    updateOutputs=False,
)
2
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
52
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
# Report the re-estimated transition probabilities, one labelled row per
# source tag.  The loop stops at '</s>', so no row is printed for it.
targets = tagset[1:]  # column tags: every tag except the first, '<s>'
for tag in tagset:
    if tag == '</s>':
        break
    cp = nnm._transitions[tag]
    # The header is indented by the width of the "%3s: " row prefix so the
    # column labels sit above their values; the column count follows
    # len(targets) rather than a hard-coded 4.
    print((5*" " + len(targets)*"%6s") % tuple(targets))
    print(("%3s: " + len(targets)*"%6.3f")
          % tuple([tag] + [cp.prob(s) for s in targets]))
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
59
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
# Report the trained model's emission probabilities, one labelled row per
# tag, followed by the row total in exponent notation.
for tag in tagset:
    cp = nnm._outputs[tag]
    # The header is indented by the width of the "%3s: " row prefix so the
    # column labels sit above their values; the column count follows
    # len(symbols) rather than a hard-coded 5.
    print((5*" " + len(symbols)*"%6s") % tuple(symbols))
    x = [cp.prob(s) for s in symbols]
    print(("%3s: " + len(symbols)*"%6.3f" + "%11.4e")
          % tuple([tag] + x + [sum(x)]))
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
65
e07789816ca5 adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
# Score the trained tagger against the gold tags in sents.
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour
# of accuracy() -- confirm against the installed nltk before switching.
print(nnm.evaluate(sents))