diff hmm/semiSup.py @ 2:e07789816ca5

adding more python files from lib/python on origen
author Henry Thompson <ht@markup.co.uk>
date Mon, 09 Mar 2020 16:48:09 +0000
parents
children 26d9c0308fcf
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hmm/semiSup.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,223 @@
+'''Exploring the claim that a small dictionary can seed
+an otherwise unsupervised HMM to learn a decent POS-tagger'''
+import nltk, random, itertools
+from nltk.corpus import brown
+from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer, logsumexp2
+from nltk.probability import FreqDist,ConditionalFreqDist
+from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist
+
+def totLogProb(self,sequences):
+  '''Return the summed log probability of a collection of sequences.
+
+  Assigned to HiddenMarkovModelTagger.totLogProb further down, hence
+  the self parameter.
+
+  sequences -- iterable of sequences, each in whatever form
+               self._forward_probability accepts
+
+  Returns the sum, over all sequences, of logsumexp2 applied to the
+  forward row for the last position; 0 if there is nothing to score.
+  '''
+  logProb = 0
+  for sequence in sequences:
+    T = len(sequence)
+    if T == 0:
+      # no last position to read a forward row from: contributes nothing
+      continue
+    # only the forward pass is needed here; the backward pass was
+    #  computed and thrown away before
+    alpha = self._forward_probability(sequence)
+    # find the log probability of the sequence
+    logProb += logsumexp2(alpha[T-1])
+  return logProb
+
+# Monkey-patch: attach totLogProb to the NLTK tagger class so that it
+#  can be called as a method on any HiddenMarkovModelTagger instance
+HiddenMarkovModelTagger.totLogProb=totLogProb
+
+# Experiment knobs, all fractions between 0 and 1
+trainTagsPercent = 1.0   # share of tagged word tokens sampled by splitData
+trainHMMPercent = 0.9    # per-sentence chance of going to the HMM training set
+knownWordsPercent = 1.0  # share of each tag's word list kept by pickWords
+
+# Sentence boundary markers: the same string serves as word and as tag
+SSW = '<s>'
+SST = SSW
+ESW = '</s>'
+EST = ESW
+# One-pair sequences used to bracket each sentence
+SS = [(SSW, SST)]
+ES = [(ESW, EST)]
+# Tag inventories keyed by tagset name; setup() returns TAGSETS[tagset]
+#  as the third element of its result.
+# NOTE(review): only the 'univ' list ends with the boundary tags SST and
+#  EST; the 'brown' and 'upenn' lists do not contain them -- presumably
+#  only the universal set has been exercised with the code below; confirm
+#  before switching tagsets.
+TAGSETS={
+  'univ':
+  [u'ADJ', u'ADP', u'ADV', u'CONJ', u'DET', u'NOUN', u'NUM',
+   u'PRON', u'PRT', u'VERB', u'X', u'.',SST,EST],
+  'brown':
+  [u"ABL", u"ABN", u"ABX", u"AP", u"AP$", u"AP+AP", u"AT", u"BE",
+   u"BED", u"BED*", u"BEDZ", u"BEDZ*", u"BEG", u"BEM", u"BEM*",
+   u"BEN", u"BER", u"BER*", u"BEZ", u"BEZ*", u"CC", u"CD",
+   u"CD$", u"CS", u"DO", u"DO*", u"DO+PPSS", u"DOD", u"DOD*",
+   u"DOZ", u"DOZ*", u"DT", u"DT$", u"DT+BEZ", u"DT+MD", u"DTI",
+   u"DTS", u"DTS+BEZ", u"DTX", u"EX", u"EX+BEZ", u"EX+HVD", u"EX+HVZ",
+   u"EX+MD", u"FW-*", u"FW-AT", u"FW-AT+NN", u"FW-AT+NP", u"FW-BE", u"FW-BER",
+   u"FW-BEZ", u"FW-CC", u"FW-CD", u"FW-CS", u"FW-DT", u"FW-DT+BEZ", u"FW-DTS",
+   u"FW-HV", u"FW-IN", u"FW-IN+AT", u"FW-IN+NN", u"FW-IN+NP", u"FW-JJ",
+   u"FW-JJR", u"FW-JJT", u"FW-NN", u"FW-NN$", u"FW-NNS", u"FW-NP", u"FW-NPS",
+   u"FW-NR", u"FW-OD", u"FW-PN", u"FW-PP$", u"FW-PPL", u"FW-PPL+VBZ",
+   u"FW-PPO",  u"FW-PPO+IN", u"FW-PPS", u"FW-PPSS", u"FW-PPSS+HV", u"FW-QL",
+   u"FW-RB",  u"FW-RB+CC", u"FW-TO+VB", u"FW-UH", u"FW-VB", u"FW-VBD",
+   u"FW-VBG",  u"FW-VBN", u"FW-VBZ", u"FW-WDT", u"FW-WPO", u"FW-WPS", u"HV",
+   u"HV*",  u"HV+TO", u"HVD", u"HVD*", u"HVG", u"HVN", u"HVZ", u"HVZ*", u"IN",
+   u"IN+IN",  u"IN+PPO", u"JJ", u"JJ$", u"JJ+JJ", u"JJR", u"JJR+CS", u"JJS",
+   u"JJT",  u"MD", u"MD*", u"MD+HV", u"MD+PPSS", u"MD+TO", u"NN", u"NN$",
+   u"NN+BEZ", u"NN+HVD", u"NN+HVZ", u"NN+IN", u"NN+MD", u"NN+NN", u"NNS",
+   u"NNS$", u"NNS+MD", u"NP", u"NP$", u"NP+BEZ", u"NP+HVZ", u"NP+MD",
+   u"NPS", u"NPS$", u"NR", u"NR$", u"NR+MD", u"NRS", u"OD",
+   u"PN", u"PN$", u"PN+BEZ", u"PN+HVD", u"PN+HVZ", u"PN+MD", u"PP$",
+   u"PP$$", u"PPL", u"PPLS", u"PPO", u"PPS", u"PPS+BEZ", u"PPS+HVD",
+   u"PPS+HVZ", u"PPS+MD", u"PPSS", u"PPSS+BEM", u"PPSS+BER", u"PPSS+BEZ",
+   u"PPSS+BEZ*", u"PPSS+HV", u"PPSS+HVD", u"PPSS+MD", u"PPSS+VB", u"QL",
+   u"QLP",  u"RB", u"RB$", u"RB+BEZ", u"RB+CS", u"RBR", u"RBR+CS", u"RBT",
+   u"RN",  u"RP", u"RP+IN", u"TO", u"TO+VB", u"UH", u"VB", u"VB+AT",
+   u"VB+IN", u"VB+JJ", u"VB+PPO", u"VB+RP", u"VB+TO", u"VB+VB", u"VBD",
+   u"VBG", u"VBG+TO", u"VBN", u"VBN+TO", u"VBZ", u"WDT", u"WDT+BER",
+   u"WDT+BER+PP", u"WDT+BEZ", u"WDT+DO+PPS", u"WDT+DOD", u"WDT+HVZ", u"WP$",
+   u"WPO", u"WPS", u"WPS+BEZ", u"WPS+HVD", u"WPS+HVZ", u"WPS+MD", u"WQL",
+   u"WRB", u"WRB+BER", u"WRB+BEZ", u"WRB+DO", u"WRB+DOD", u"WRB+DOD*",
+   u"WRB+DOZ", u"WRB+IN", u"WRB+MD",
+   u"(", u")", u"*", u",", u"--", u".", u":"],
+  'upenn':
+  [u"CC", u"CD", u"DT", u"EX", u"FW", u"IN", u"JJ", u"JJR", u"JJS", u"LS",
+   u"MD", u"NN", u"NNP", u"NNPS", u"NNS", u"PDT", u"POS", u"PRP", u"PRP$",
+   u"RB", u"RBR", u"RBS", u"RP", u"SYM", u"TO", u"UH", u"VB", u"VBD", u"VBG",
+   u"VBN", u"VBP", u"VBZ", u"WDT", u"WP", u"WP$", u"WRB",
+   u"``", u"$", u"''", u"(", u")", u",", u"--", u".", u":"]}
+
+# Longer aliases; each refers to the very same list object as the short key
+TAGSETS['universal']=TAGSETS['univ']
+TAGSETS['penn']=TAGSETS['upenn']
+
+def setup(cat='news',tagset='brown',corpus=brown):
+  '''Load one category of a tagged corpus, lower-casing every word.
+
+  cat    -- passed to the corpus reader as categories
+  tagset -- passed to the corpus reader as tagset, and used as the key
+            into TAGSETS
+  corpus -- object offering tagged_sents and tagged_words
+
+  Returns a triple:
+   - a list of sentences, each a list of (word,tag) pairs bracketed by
+     the SS and ES boundary pairs;
+   - a flat list of (word,tag) pairs that begins with the SS and ES
+     pairs, followed by every tagged word of the category;
+   - the tag list stored in TAGSETS under tagset.
+  '''
+  def lowered(pairs):
+    # same pairs, words folded to lower case
+    return [(word.lower(),tag) for (word,tag) in pairs]
+
+  sentences=[SS+lowered(s)+ES
+             for s in corpus.tagged_sents(categories=cat,tagset=tagset)]
+  words=SS+ES+lowered(corpus.tagged_words(categories=cat,tagset=tagset))
+  return (sentences,words,TAGSETS[tagset])
+
+def notCurrent(s,missList):
+  '''Say whether s is NOT the next not-yet-matched member of missList.
+
+  Meant to be called once per item while walking, in order, a sequence
+  of which missList is a subsequence (same objects, same order).  The
+  cursor is kept in module globals which the caller must reset first:
+  i=0, n=len(missList), done=False.
+
+  Returns False (and advances the cursor) when s is missList[i],
+  compared by identity; returns True otherwise, and always once the
+  cursor has reached the end of missList.
+  '''
+  global i,n,done
+  # The bounds check covers an empty missList: with done still False
+  #  the lookup missList[0] would otherwise raise IndexError
+  if done or i>=len(missList) or (missList[i] is not s):
+    return True
+  i+=1
+  if i==n:
+    done=True
+  return False
+
+def splitData(words,wordPercent,sentences,sentPercent):
+  '''Randomly pick training words and split sentences into train/test.
+
+  words       -- sequence to sample from
+  wordPercent -- fraction (0..1) of words to draw, without replacement
+  sentences   -- iterable of sentences
+  sentPercent -- probability (0..1) that a sentence goes to training
+
+  Returns (trainWords, trainSents, testSents).  trainSents and
+  testSents keep the original sentence order and between them hold
+  every sentence exactly once.
+  '''
+  trainWords=random.sample(words,int(wordPercent*len(words)))
+  trainSents=[]
+  testSents=[]
+  # One random draw per sentence, in order, decides its side.  A single
+  #  pass needs no shared cursor state and copes with the case where
+  #  no sentence at all is drawn for training.
+  for s in sentences:
+    if random.random()<sentPercent:
+      trainSents.append(s)
+    else:
+      testSents.append(s)
+  return trainWords, trainSents, testSents
+
+def pickWords(tagged,percent):
+  '''Build a per-tag dictionary of the most frequent words.
+
+  tagged  -- iterable of (word,tag) pairs
+  percent -- fraction (0..1) of each tag's distinct words to keep
+
+  Returns a dict mapping each tag to a list of (word,count) pairs,
+  highest count first, cut down to int(percent * number of distinct
+  words seen with that tag) entries.
+  '''
+  tToW=ConditionalFreqDist((t,w) for (w,t) in tagged)
+  dd={}
+  for tag in tToW.keys():
+    # kv is a (word,count) pair; indexing it avoids the Python-2-only
+    #  "lambda (k,v)" parameter form
+    byCount=sorted(tToW[tag].items(),key=lambda kv:kv[1],reverse=True)
+    dd[tag]=byCount[:int(percent*len(byCount))]
+  return dd
+
+# Load the corpus (setup's default category) with the universal tagset
+tagged_s,tagged_w,tagset=setup(tagset='universal')
+
+# setup() puts the SS and ES pairs at the front of the word list: skip them
+true_tagged_w=tagged_w[2:]
+
+# Word frequencies over the real tokens, and the list of word types
+wordTokens=FreqDist(w for (w,_t) in true_tagged_w)
+wordsAsSuch=list(wordTokens.keys())
+# number of types, number of tokens
+print len(wordTokens), wordTokens.N()
+
+# Tagged words that feed the dictionary, plus train/test sentences
+trainTags,trainHMM,testHMM=splitData(true_tagged_w,trainTagsPercent,
+                                     tagged_s,trainHMMPercent)
+
+# The seed dictionary: for each tag, its (word,count) list
+knownWords=pickWords(trainTags,knownWordsPercent)
+
+class SubsetFreqDist(FreqDist):
+  '''FreqDist over a fixed vocabulary in which only some words carry
+  explicit counts; every other vocabulary word counts as basecount.'''
+  def __init__(self,pairs,baseset,basecount=.05):
+    '''pairs     -- sized, re-iterable collection of (word,count) pairs
+       baseset   -- sized container of all words (the vocabulary)
+       basecount -- count reported for vocabulary words not in pairs'''
+    self._baseset=baseset
+    self._basecount=basecount
+    # store the explicit counts using the plain dict method
+    dict.update(self,pairs)
+    explicitTotal=sum(count for _word,count in pairs)
+    implicitTotal=(len(baseset)-len(pairs))*basecount
+    self._N=explicitTotal+implicitTotal
+
+  def __getitem__(self,key):
+    # plain dict lookup, which falls back to __missing__ for absent keys
+    return dict.__getitem__(self,key)
+
+  def __missing__(self,key):
+    '''Count for a key without an explicit entry: basecount inside the
+    vocabulary, 0 outside it.'''
+    return self._basecount if key in self._baseset else 0
+
+  def N(self):
+    '''The total computed in __init__: explicit counts plus basecount
+    for each remaining vocabulary word.'''
+    return self._N
+
+class Tag:
+  '''A tag together with the (word,count) pairs known for it.'''
+  def __init__(self,tag,wordsAndCounts):
+    '''tag            -- the tag name
+       wordsAndCounts -- re-iterable collection of (word,count) pairs'''
+    self._tag=tag
+    self._wordsAndCounts=wordsAndCounts
+    vocab={word for word,_count in wordsAndCounts}
+    self._words=vocab
+    self._nTokens=sum(count for _word,count in wordsAndCounts)
+    self._nTypes=len(vocab)
+
+  def words(self):
+    '''The set of words listed for this tag.'''
+    return self._words
+
+  def buildPD(self,allTokens):
+    '''Create this tag's SubsetFreqDist over the vocabulary allTokens
+    and the MLEProbDist based on it.'''
+    sfd=SubsetFreqDist(self._wordsAndCounts,allTokens)
+    self._sfd=sfd
+    self._pd=MLEProbDist(sfd)
+
+  def getSFD(self):
+    '''The SubsetFreqDist made by buildPD.'''
+    return self._sfd
+
+  def getPD(self):
+    '''The probability distribution made by buildPD.'''
+    return self._pd
+
+class FixedTag(Tag):
+  '''Tag whose distribution is built from its own (word,count) pairs
+  alone, without a vocabulary argument.'''
+  def buildPD(self):
+    '''Build the MLEProbDist straight from the (word,count) pairs.'''
+    counts=FreqDist(dict(self._wordsAndCounts))
+    self._pd=MLEProbDist(counts)
+
+  def getSFD(self):
+    '''A FixedTag has no SubsetFreqDist to hand out.'''
+    raise NotImplementedError("not implemented for this subclass")
+
+# One Tag per tag that has dictionary entries, and each tag's word set
+tags=dict((name,Tag(name,pairs)) for name,pairs in knownWords.items())
+kws=dict((name,t.words()) for name,t in tags.items())
+
+# Ambiguity survey: for every pair of tags, leaving out the last two
+#  entries of tagset, the set of words listed under both; pairs that
+#  share no word are dropped
+t2=list(entry for entry in
+        (((a,b),kws[a].intersection(kws[b]))
+         for a,b in itertools.combinations(tagset[:len(tagset)-2],2))
+        if entry[1])
+
+# Done before the boundary tags are added: their buildPD takes no
+#  vocabulary argument
+for tag in tags.values():
+  tag.buildPD(wordTokens)
+
+# Each boundary tag is given just its own marker word, with count 1
+for boundaryTag,boundaryWord in ((SST,SSW),(EST,ESW)):
+  tags[boundaryTag]=FixedTag(boundaryTag,[(boundaryWord,1)])
+  tags[boundaryTag].buildPD()
+
+# Initial state distribution: count 1 for the start-of-sentence tag,
+#  count 0 for every other tag
+priors = MLEProbDist(FreqDist(dict((tag,1 if tag==SST else 0) for tag in tagset)))
+
+# One RandomProbDist over the tagset per state
+transitions = DictionaryConditionalProbDist(
+                dict((state, RandomProbDist(tagset))
+                      for state in tagset))
+
+# Per-state output distributions: the ones built by buildPD above
+outputs = DictionaryConditionalProbDist(
+                dict((state, tags[state].getPD())
+                      for state in tagset))
+
+# The seeded starting model: word types as symbols, tags as states
+model = HiddenMarkovModelTagger(wordsAsSuch, tagset,
+                transitions, outputs, priors)
+
+# Score the starting model on the held-out sentences before training
+print "model", model.evaluate(testHMM), model.totLogProb(testHMM)
+
+nm=HiddenMarkovModelTrainer(states=tagset,symbols=wordsAsSuch)
+
+# Note that contrary to naive reading of the documentation,
+#  train_unsupervised expects a sequence of sequences of word/tag pairs,
+#  it just ignores the tags
+# NOTE(review): the positional True is presumably update_outputs, and
+#  testMe is not among the keyword arguments documented for NLTK's
+#  train_unsupervised -- presumably a locally modified NLTK handles it;
+#  confirm
+nnm=nm.train_unsupervised(trainHMM,True,model=model,max_iterations=10,testMe=testHMM)
+
+# Total log probability of the held-out sentences under the trained model
+print nnm.totLogProb(testHMM)