Mercurial > hg > python
annotate hmm/tinySup.py @ 64:fff2fa031ed7
regexp finished, I hope
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 14 Dec 2023 11:43:44 +0000 |
parents | 26d9c0308fcf |
children |
rev | line source |
---|---|
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 '''Trivial test of unsupervised learning with full dictionary supplied |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
2 See fnlp/lectures/12/hmmDNV.xlsx''' |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
3 import nltk, random |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
4 from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
5 from nltk.probability import FreqDist,ConditionalFreqDist |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
6 from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
7 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
8 tagset=['<s>','D','N','V','</s>'] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
9 symbols=['<s>','the','sheep','run','</s>'] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
10 sents=[[('<s>','<s>'),('the','D'),('sheep','N'),('run','V'),('</s>','</s>')], |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
11 [('<s>','<s>'),('sheep','N'),('run','V'),('the','D'),('sheep','N'),('</s>','</s>')], |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
12 [('<s>','<s>'),('run','V'),('the','D'),('sheep','N'),('</s>','</s>')]] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
13 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
14 taglists=[('<s>',[('<s>',1),('the',0),('sheep',0),('run',0),('</s>',0)]), |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
15 ('D',[('the',.8),('sheep',.1),('run',.1),('<s>',0),('</s>',0)]), |
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
16 ('N',[('the',.2),('sheep',.4),('run',.4),('<s>',0),('</s>',0)]), |
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
17 ('V',[('the',.2),('sheep',.4),('run',.4),('<s>',0),('</s>',0)]), |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
18 ('</s>',[('<s>',0),('the',0),('sheep',0),('run',0),('</s>',1)])] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
19 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
20 tagdict=dict((k,MLEProbDist(FreqDist(dict(v)))) for k,v in taglists) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
21 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
22 priors = MLEProbDist(FreqDist({'<s>':1, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
23 'D':0, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
24 'N':0, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
25 'V':0, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
26 '</s>':0})) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
27 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
28 transitions = DictionaryConditionalProbDist( |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
29 dict((state, RandomProbDist(tagset)) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
30 for state in tagset)) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
31 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
32 outputs = DictionaryConditionalProbDist(tagdict) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
33 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
34 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
35 for tag in tagset: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
36 cp=outputs[tag] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
37 print tag,sum(cp.prob(s) for s in symbols) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
38 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
39 model = HiddenMarkovModelTagger(symbols, tagset, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
40 transitions, outputs, priors) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
41 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
42 for tag in tagset: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
43 cp=model._outputs[tag] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
44 print tag,sum(cp.prob(s) for s in symbols) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
45 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
46 nm=HiddenMarkovModelTrainer(states=tagset,symbols=symbols) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
47 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
48 # Note that contrary to naive reading of the documentation, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
49 # train_unsupervised expects a sequence of sequences of word/tag pairs, |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
50 # it just ignores the tags |
3
26d9c0308fcf
updated/added from ecclerig version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
2
diff
changeset
|
51 nnm=nm.train_unsupervised(sents,model=model,max_iterations=15,updateOutputs=False) |
2
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
52 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
53 for tag in tagset: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
54 if tag=='</s>': |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
55 break |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
56 cp=nnm._transitions[tag] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
57 print((" "+4*"%6s")%tuple(tagset[1:])) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
58 print(("%3s: "+4*"%6.3f")%tuple([tag]+[cp.prob(s) for s in tagset[1:]])) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
59 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
60 for tag in tagset: |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
61 cp=nnm._outputs[tag] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
62 print((" "+5*"%6s")%tuple(symbols)) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
63 x=[cp.prob(s) for s in symbols] |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
64 print(("%3s: "+5*"%6.3f"+"%11.4e")%tuple([tag]+x+[sum(x)])) |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
65 |
e07789816ca5
adding more python files from lib/python on origen
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
66 print nnm.evaluate(sents) |