Mercurial > hg > python
diff hmm/tinySup.py @ 2:e07789816ca5
adding more python files from lib/python on origen
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 09 Mar 2020 16:48:09 +0000 |
parents | |
children | 26d9c0308fcf |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hmm/tinySup.py Mon Mar 09 16:48:09 2020 +0000 @@ -0,0 +1,66 @@ +'''Trivial test of unsupervised learning with full dictionary supplied +See fnlp/lectures/12/hmmDNV.xlsx''' +import nltk, random +from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer +from nltk.probability import FreqDist,ConditionalFreqDist +from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist + +tagset=['<s>','D','N','V','</s>'] +symbols=['<s>','the','sheep','run','</s>'] +sents=[[('<s>','<s>'),('the','D'),('sheep','N'),('run','V'),('</s>','</s>')], + [('<s>','<s>'),('sheep','N'),('run','V'),('the','D'),('sheep','N'),('</s>','</s>')], + [('<s>','<s>'),('run','V'),('the','D'),('sheep','N'),('</s>','</s>')]] + +taglists=[('<s>',[('<s>',1),('the',0),('sheep',0),('run',0),('</s>',0)]), + ('D',[('the',1),('sheep',0),('run',0),('<s>',0),('</s>',0)]), + ('N',[('the',0),('sheep',.5),('run',.5),('<s>',0),('</s>',0)]), + ('V',[('the',0),('sheep',.5),('run',.5),('<s>',0),('</s>',0)]), + ('</s>',[('<s>',0),('the',0),('sheep',0),('run',0),('</s>',1)])] + +tagdict=dict((k,MLEProbDist(FreqDist(dict(v)))) for k,v in taglists) + +priors = MLEProbDist(FreqDist({'<s>':1, + 'D':0, + 'N':0, + 'V':0, + '</s>':0})) + +transitions = DictionaryConditionalProbDist( + dict((state, RandomProbDist(tagset)) + for state in tagset)) + +outputs = DictionaryConditionalProbDist(tagdict) + + +for tag in tagset: + cp=outputs[tag] + print tag,sum(cp.prob(s) for s in symbols) + +model = HiddenMarkovModelTagger(symbols, tagset, + transitions, outputs, priors) + +for tag in tagset: + cp=model._outputs[tag] + print tag,sum(cp.prob(s) for s in symbols) + +nm=HiddenMarkovModelTrainer(states=tagset,symbols=symbols) + +# Note that contrary to naive reading of the documentation, +# train_unsupervised expects a sequence of sequences of word/tag pairs, +# it just ignores the tags +nnm=nm.train_unsupervised(sents,model=model,max_iterations=10,updateOutputs=False) + +for tag in tagset: + if tag=='</s>': + break + cp=nnm._transitions[tag] + print((" "+4*"%6s")%tuple(tagset[1:])) + print(("%3s: "+4*"%6.3f")%tuple([tag]+[cp.prob(s) for s in tagset[1:]])) + +for tag in tagset: + cp=nnm._outputs[tag] + print((" "+5*"%6s")%tuple(symbols)) + x=[cp.prob(s) for s in symbols] + print(("%3s: "+5*"%6.3f"+"%11.4e")%tuple([tag]+x+[sum(x)])) + +print nnm.evaluate(sents)