Mercurial > hg > python
diff hmm/semiSup.py @ 2:e07789816ca5
adding more python files from lib/python on origen
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 09 Mar 2020 16:48:09 +0000 |
parents | |
children | 26d9c0308fcf |
line wrap: on
line diff
# Recovered from the hgweb diff of b/hmm/semiSup.py
# (--- /dev/null, @@ -0,0 +1,223 @@, Mon Mar 09 16:48:09 2020 +0000).
'''Exploring the claim that a small dictionary can seed
an otherwise unsupervised HMM to learn a decent POS-tagger'''
import itertools
import random

import nltk
from nltk.corpus import brown
from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer, logsumexp2
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist


def totLogProb(self, sequences):
    """Return the summed log probability of *sequences* under this model.

    For every sequence the tagger's forward pass is run and ``logsumexp2``
    of the last row of forward values is added to the total.

    sequences -- iterable of observation sequences in whatever form
                 ``self._forward_probability`` accepts

    Empty sequences are skipped: they have no last row to read and add
    nothing to the total.
    """
    logProb = 0
    for sequence in sequences:
        T = len(sequence)
        if T == 0:
            continue
        # Only the forward values are needed for the sequence probability
        alpha = self._forward_probability(sequence)
        logProb += logsumexp2(alpha[T - 1])
    return logProb


# Make the helper available as a method on every tagger instance
HiddenMarkovModelTagger.totLogProb = totLogProb

# Fraction of tagged word tokens used to build the seed dictionary
trainTagsPercent = 1.0
# Probability with which each sentence goes to HMM training (rest is test)
trainHMMPercent = 0.9
# Fraction of each tag's frequency-ranked word list kept as "known"
knownWordsPercent = 1.0

# Sentence start/end pseudo-tags and pseudo-words
SST = SSW = '<s>'
EST = ESW = '</s>'
SS = [(SSW, SST)]
ES = [(ESW, EST)]

# NOTE(review): only 'univ' includes SST/EST; the 'brown' and 'upenn' lists
# do not, although setup() wraps every sentence in them -- confirm before
# using those tagsets with the model built below.
TAGSETS = {
    'univ':
    [u'ADJ', u'ADP', u'ADV', u'CONJ', u'DET', u'NOUN', u'NUM',
     u'PRON', u'PRT', u'VERB', u'X', u'.', SST, EST],
    'brown':
    [u"ABL", u"ABN", u"ABX", u"AP", u"AP$", u"AP+AP", u"AT", u"BE",
     u"BED", u"BED*", u"BEDZ", u"BEDZ*", u"BEG", u"BEM", u"BEM*",
     u"BEN", u"BER", u"BER*", u"BEZ", u"BEZ*", u"CC", u"CD",
     u"CD$", u"CS", u"DO", u"DO*", u"DO+PPSS", u"DOD", u"DOD*",
     u"DOZ", u"DOZ*", u"DT", u"DT$", u"DT+BEZ", u"DT+MD", u"DTI",
     u"DTS", u"DTS+BEZ", u"DTX", u"EX", u"EX+BEZ", u"EX+HVD", u"EX+HVZ",
     u"EX+MD", u"FW-*", u"FW-AT", u"FW-AT+NN", u"FW-AT+NP", u"FW-BE", u"FW-BER",
     u"FW-BEZ", u"FW-CC", u"FW-CD", u"FW-CS", u"FW-DT", u"FW-DT+BEZ", u"FW-DTS",
     u"FW-HV", u"FW-IN", u"FW-IN+AT", u"FW-IN+NN", u"FW-IN+NP", u"FW-JJ",
     u"FW-JJR", u"FW-JJT", u"FW-NN", u"FW-NN$", u"FW-NNS", u"FW-NP", u"FW-NPS",
     u"FW-NR", u"FW-OD", u"FW-PN", u"FW-PP$", u"FW-PPL", u"FW-PPL+VBZ",
     u"FW-PPO", u"FW-PPO+IN", u"FW-PPS", u"FW-PPSS", u"FW-PPSS+HV", u"FW-QL",
     u"FW-RB", u"FW-RB+CC", u"FW-TO+VB", u"FW-UH", u"FW-VB", u"FW-VBD",
     u"FW-VBG", u"FW-VBN", u"FW-VBZ", u"FW-WDT", u"FW-WPO", u"FW-WPS", u"HV",
     u"HV*", u"HV+TO", u"HVD", u"HVD*", u"HVG", u"HVN", u"HVZ", u"HVZ*", u"IN",
     u"IN+IN", u"IN+PPO", u"JJ", u"JJ$", u"JJ+JJ", u"JJR", u"JJR+CS", u"JJS",
     u"JJT", u"MD", u"MD*", u"MD+HV", u"MD+PPSS", u"MD+TO", u"NN", u"NN$",
     u"NN+BEZ", u"NN+HVD", u"NN+HVZ", u"NN+IN", u"NN+MD", u"NN+NN", u"NNS",
     u"NNS$", u"NNS+MD", u"NP", u"NP$", u"NP+BEZ", u"NP+HVZ", u"NP+MD",
     u"NPS", u"NPS$", u"NR", u"NR$", u"NR+MD", u"NRS", u"OD",
     u"PN", u"PN$", u"PN+BEZ", u"PN+HVD", u"PN+HVZ", u"PN+MD", u"PP$",
     u"PP$$", u"PPL", u"PPLS", u"PPO", u"PPS", u"PPS+BEZ", u"PPS+HVD",
     u"PPS+HVZ", u"PPS+MD", u"PPSS", u"PPSS+BEM", u"PPSS+BER", u"PPSS+BEZ",
     u"PPSS+BEZ*", u"PPSS+HV", u"PPSS+HVD", u"PPSS+MD", u"PPSS+VB", u"QL",
     u"QLP", u"RB", u"RB$", u"RB+BEZ", u"RB+CS", u"RBR", u"RBR+CS", u"RBT",
     u"RN", u"RP", u"RP+IN", u"TO", u"TO+VB", u"UH", u"VB", u"VB+AT",
     u"VB+IN", u"VB+JJ", u"VB+PPO", u"VB+RP", u"VB+TO", u"VB+VB", u"VBD",
     u"VBG", u"VBG+TO", u"VBN", u"VBN+TO", u"VBZ", u"WDT", u"WDT+BER",
     u"WDT+BER+PP", u"WDT+BEZ", u"WDT+DO+PPS", u"WDT+DOD", u"WDT+HVZ", u"WP$",
     u"WPO", u"WPS", u"WPS+BEZ", u"WPS+HVD", u"WPS+HVZ", u"WPS+MD", u"WQL",
     u"WRB", u"WRB+BER", u"WRB+BEZ", u"WRB+DO", u"WRB+DOD", u"WRB+DOD*",
     u"WRB+DOZ", u"WRB+IN", u"WRB+MD",
     u"(", u")", u"*", u",", u"--", u".", u":"],
    'upenn':
    [u"CC", u"CD", u"DT", u"EX", u"FW", u"IN", u"JJ", u"JJR", u"JJS", u"LS",
     u"MD", u"NN", u"NNP", u"NNPS", u"NNS", u"PDT", u"POS", u"PRP", u"PRP$",
     u"RB", u"RBR", u"RBS", u"RP", u"SYM", u"TO", u"UH", u"VB", u"VBD", u"VBG",
     u"VBN", u"VBP", u"VBZ", u"WDT", u"WP", u"WP$", u"WRB",
     u"``", u"$", u"''", u"(", u")", u",", u"--", u".", u":"]}

# Aliases
TAGSETS['universal'] = TAGSETS['univ']
TAGSETS['penn'] = TAGSETS['upenn']


def setup(cat='news', tagset='brown', corpus=brown):
    """Load tagged data from *corpus* and return ``(sentences, words, tags)``.

    cat    -- value passed as ``categories`` to the corpus reader
    tagset -- key into TAGSETS, also passed as ``tagset`` to the corpus reader
    corpus -- object providing ``tagged_sents`` and ``tagged_words``

    Returns a 3-tuple:
      * list of sentences, each a list of (lower-cased word, tag) pairs
        wrapped in the SS and ES markers;
      * flat list of (lower-cased word, tag) pairs, preceded by the SS and
        ES entries (so real tokens start at index 2);
      * the tag list ``TAGSETS[tagset]``.
    """
    def lowered(pairs):
        return [(word.lower(), tag) for (word, tag) in pairs]

    sentences = [SS + lowered(s) + ES
                 for s in corpus.tagged_sents(categories=cat, tagset=tagset)]
    words = SS + ES + lowered(corpus.tagged_words(categories=cat, tagset=tagset))
    return (sentences, words, TAGSETS[tagset])


# State for notCurrent(); these defaults make it report "not current" for
# everything until a caller sets i, n and done explicitly.
i = 0
n = 0
done = True


def notCurrent(s, missList):
    """Return True unless *s* is the next pending entry of *missList*.

    Retained for backward compatibility; splitData() no longer uses it.
    It walks *missList* through the module globals ``i`` (cursor), ``n``
    (length at which to stop) and ``done``.  When *s* is (by identity) the
    entry at the cursor, the cursor advances and False is returned.
    """
    global i, n, done
    if done or (missList[i] is not s):
        return True
    i += 1
    if i == n:
        done = True
    return False


def splitData(words, wordPercent, sentences, sentPercent):
    """Randomly split words and sentences for training and testing.

    words       -- sequence to sample training words from
    wordPercent -- fraction of *words* to sample (without replacement)
    sentences   -- iterable of sentences
    sentPercent -- probability that any one sentence goes to training

    Returns ``(trainWords, trainSents, testSents)``.  Each sentence is
    assigned by an independent random draw, so the training share is only
    approximately *sentPercent*; every sentence ends up in exactly one of
    the two sentence lists, in its original order.
    """
    trainWords = random.sample(words, int(wordPercent * len(words)))
    trainSents = []
    testSents = []
    # Single pass: one draw per sentence, in input order
    for s in sentences:
        if random.random() < sentPercent:
            trainSents.append(s)
        else:
            testSents.append(s)
    return trainWords, trainSents, testSents


def pickWords(tagged, percent):
    """Return, per tag, the most frequent *percent* of its word types.

    tagged  -- iterable of (word, tag) pairs
    percent -- fraction of each tag's word list to keep

    Returns a dict mapping each tag to a list of (word, count) pairs,
    ordered by descending count and truncated to ``int(percent * len)``.
    """
    tToW = ConditionalFreqDist((t, w) for (w, t) in tagged)
    dd = {}
    for tag in tToW.keys():
        ranked = sorted(tToW[tag].items(), key=lambda kv: kv[1], reverse=True)
        dd[tag] = ranked[:int(percent * len(ranked))]
    return dd


(tagged_s, tagged_w, tagset) = setup(tagset='universal')

true_tagged_w = tagged_w[2:]  # drop the leading SS and ES entries

wordTokens = FreqDist(word for word, tag in true_tagged_w)
wordsAsSuch = list(wordTokens.keys())
# Single-argument form prints the same text under Python 2 and 3
print("%s %s" % (len(wordTokens), wordTokens.N()))

(trainTags, trainHMM, testHMM) = splitData(true_tagged_w, trainTagsPercent,
                                           tagged_s, trainHMMPercent)

knownWords = pickWords(trainTags, knownWordsPercent)


class SubsetFreqDist(FreqDist):
    """Frequency distribution with a small default count for unseen samples.

    Samples listed in *pairs* keep their given counts; any other sample
    that is in *baseset* reports *basecount*; everything else reports 0.
    """

    def __init__(self, pairs, baseset, basecount=.05):
        """
        pairs     -- sequence of (sample, count) pairs
        baseset   -- container of all admissible samples
        basecount -- count reported for samples in *baseset* not in *pairs*
        """
        # Store the counts directly; FreqDist's own constructor is
        # deliberately not called.
        dict.update(self, pairs)
        self._baseset = baseset
        self._basecount = basecount
        pn = sum(n for w, n in pairs)
        # Total mass = explicit counts + basecount for every other sample
        self._N = pn + ((len(baseset) - len(pairs)) * basecount)

    def __getitem__(self, key):
        # Plain dict lookup, which falls back to __missing__ below
        return dict.__getitem__(self, key)

    def __missing__(self, key):
        if key in self._baseset:
            return self._basecount
        else:
            return 0

    def N(self):
        """Return the total count computed at construction time."""
        return self._N


class Tag(object):
    """A tag together with the words (and counts) known to carry it."""

    def __init__(self, tag, wordsAndCounts):
        """
        tag            -- the tag name
        wordsAndCounts -- sequence of (word, count) pairs for this tag
        """
        self._tag = tag
        self._wordsAndCounts = wordsAndCounts
        self._words = set(w for w, n in wordsAndCounts)
        self._nTokens = sum(n for w, n in wordsAndCounts)
        self._nTypes = len(self._words)

    def words(self):
        """Return the set of words known for this tag."""
        return self._words

    def buildPD(self, allTokens):
        """Build the output distribution, using *allTokens* as the base set
        of the underlying SubsetFreqDist."""
        self._sfd = SubsetFreqDist(self._wordsAndCounts, allTokens)
        self._pd = MLEProbDist(self._sfd)

    def getSFD(self):
        """Return the SubsetFreqDist created by buildPD()."""
        return self._sfd

    def getPD(self):
        """Return the probability distribution created by buildPD()."""
        return self._pd


class FixedTag(Tag):
    """A tag whose distribution covers exactly its own words."""

    def buildPD(self, allTokens=None):
        """Build the distribution from this tag's own counts only.

        *allTokens* is accepted (and ignored) so the signature stays
        compatible with Tag.buildPD.
        """
        self._pd = MLEProbDist(FreqDist(dict(self._wordsAndCounts)))

    def getSFD(self):
        raise NotImplementedError("not implemented for this subclass")


tags = dict((tagName, Tag(tagName, wl)) for tagName, wl in knownWords.items())
kws = dict((tagName, tag.words()) for tagName, tag in tags.items())


def _sharedWords(tagNames, wordSets):
    """Yield ((tagA, tagB), commonWords) for each tag pair sharing words."""
    for first, second in itertools.combinations(tagNames, 2):
        common = wordSets[first].intersection(wordSets[second])
        if common:
            yield (first, second), common


# Ambiguity table over every pair of tags except the last two entries of
# tagset (SST and EST in the 'univ' list)
t2 = list(_sharedWords(tagset[:-2], kws))

for tag in tags.values():
    tag.buildPD(wordTokens)

# The sentence markers emit only their own pseudo-word
tags[SST] = FixedTag(SST, [(SSW, 1)])
tags[SST].buildPD()
tags[EST] = FixedTag(EST, [(ESW, 1)])
tags[EST].buildPD()

# All prior mass on the sentence-start tag
priors = MLEProbDist(FreqDist(dict((tag, 1 if tag == SST else 0) for tag in tagset)))

transitions = DictionaryConditionalProbDist(
    dict((state, RandomProbDist(tagset))
         for state in tagset))

outputs = DictionaryConditionalProbDist(
    dict((state, tags[state].getPD())
         for state in tagset))

model = HiddenMarkovModelTagger(wordsAsSuch, tagset,
                                transitions, outputs, priors)

print("model %s %s" % (model.evaluate(testHMM), model.totLogProb(testHMM)))

nm = HiddenMarkovModelTrainer(states=tagset, symbols=wordsAsSuch)

# Note that contrary to naive reading of the documentation,
# train_unsupervised expects a sequence of sequences of word/tag pairs,
# it just ignores the tags
# NOTE(review): testMe looks like a keyword of a locally modified trainer --
# confirm against the nltk version in use.
nnm = nm.train_unsupervised(trainHMM, True, model=model, max_iterations=10, testMe=testHMM)

print(nnm.totLogProb(testHMM))